diff --git a/README.md b/README.md index 3685991c..a21fac4e 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ Allow dynamic-group instance_principal to manage compute-management-family in co Allow dynamic-group instance_principal to manage instance-family in compartment compartmentName Allow dynamic-group instance_principal to use virtual-network-family in compartment compartmentName Allow dynamic-group instance_principal to use volumes in compartment compartmentName +Allow dynamic-group instance_principal to manage dns in compartment compartmentName ``` or: @@ -34,12 +35,9 @@ The stack allowa various combination of OS. Here is a list of what has been test | Controller | Compute | |---------------|--------------| -| OL7 | OL7 | -| OL7 | OL8 | -| OL7 | CentOS7 | -| OL8 | OL8 | -| OL8 | OL7 | -| Ubuntu 20.04 | Ubuntu 20.04 | +| OL8 | OL8 | +| OL8 | OL7 | +| Ubuntu 22.04 | Ubuntu 22.04 | When switching to Ubuntu, make sure the username is changed from opc to Ubuntu in the ORM for both the controller and compute nodes. ## How is resizing different from autoscaling ? @@ -276,10 +274,6 @@ Example: ``` /opt/oci-hpc/bin/create_cluster.sh 4 compute2-1-hpc HPC_instance compute2 ``` -The name of the cluster must be -queueName-clusterNumber-instanceType_keyword - -The keyword will need to match the one from /opt/oci-hpc/conf/queues.conf to be registered in Slurm ### Cluster Deletion: ``` @@ -422,3 +416,14 @@ By default, this check box is enabled. By selecting, this check-box, a PAR would Step 2: Use shell script: upload_rdma_nic_metrics.sh to collect metrics and upload to object storage. User needs to use shell script: upload_rdma_nic_metrics.sh to collect metrics and upload to object storage. User could configure metrics collection limit and interval through config file: rdma_metrics_collection_config.conf. + +## Meshpinger + +Meshpinger is a tool for validating network layer connectivity between RDMA NICs on a cluster network in OCI. 
The tool is capable of initiating ICMP ping from every RDMA NIC port on the cluster network to every other RDMA NIC port on the same cluster network and +reporting back the success/failure status of the pings performed in the form of logs. + +Running the tool before starting a workload on a cluster network should serve as a good precheck step to gain confidence in the network reachability between RDMA NICs. Typical causes for reachability failures that the tool can help pinpoint are: +1. Link down on the RDMA NIC +2. RDMA interface initialization or configuration issues including IP address assignment to +the interface +3. Insufficient ARP table size on the node to store all needed peer MAC addresses \ No newline at end of file diff --git a/autoscaling/crontab/autoscale_slurm.sh b/autoscaling/crontab/autoscale_slurm.sh index 8ccde4ef..ae6054db 100755 --- a/autoscaling/crontab/autoscale_slurm.sh +++ b/autoscaling/crontab/autoscale_slurm.sh @@ -111,12 +111,12 @@ def getDefaultsConfig(config,queue_name): for instance_type in partition["instance_types"]: if "default" in instance_type.keys(): if instance_type["default"]: - return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "instance_keyword":instance_type["instance_keyword"]} + return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]} if len(partition["instance_types"])>0: instance_type=partition["instance_types"][0] print ("No default configuration was found, there may be a problem in your queues.conf file") print ("Selecting "+instance_type["name"]+" as default") - return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], 
"instance_keyword":instance_type["instance_keyword"]} + return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]} print ("The queue "+queue_name+" was not found in the queues.conf file") return None @@ -125,7 +125,7 @@ def getJobConfig(config,queue_name,instance_type_name): if queue_name == partition["name"]: for instance_type in partition["instance_types"]: if instance_type_name == instance_type["name"]: - return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "instance_keyword":instance_type["instance_keyword"]} + return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]} return None def getQueueLimits(config,queue_name,instance_type_name): @@ -136,11 +136,11 @@ def getQueueLimits(config,queue_name,instance_type_name): return {"max_number_nodes": int(instance_type["max_number_nodes"]), "max_cluster_size": int(instance_type["max_cluster_size"]),"max_cluster_count": int(instance_type["max_cluster_count"])} return {"max_number_nodes": 0, "max_cluster_size": 0,"max_cluster_count": 0} -def getInstanceType(config,queue_name,instance_keyword): +def getInstanceType(config,queue_name,hostname_convention): for partition in config: if queue_name == partition["name"]: for instance_type in partition["instance_types"]: - if instance_keyword == instance_type["instance_keyword"]: + if hostname_convention == instance_type["hostname_convention"]: return instance_type["name"] return None @@ -161,26 +161,33 @@ def getAllClusterNames(config): return availableNames def getClusterName(node): - out = subprocess.Popen(['scontrol','show','topology',node], 
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True) - stdout,stderr = out.communicate() - clusterName = None - try: - if len(stdout.split('\n')) > 2: - for output in stdout.split('\n')[:-1]: - if "Switches=" in output: - clusterName=output.split()[0].split('SwitchName=')[1] - break - elif "SwitchName=inactive-" in output: - continue - else: - clusterName=output.split()[0].split('SwitchName=')[1] - elif len(stdout.split('\n')) == 2: - clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1] - if clusterName.startswith("inactive-"): + details=getNodeDetails(node) + clusterName="NOCLUSTERFOUND" + for feature in details[0].split(","): + if feature.startswith('CN__'): + clusterName=feature[4:] + if clusterName == "NOCLUSTERFOUND": + out = subprocess.Popen(['scontrol','show','topology',node], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True) + stdout,stderr = out.communicate() + clusterName = None + try: + if len(stdout.split('\n')) > 2: + for output in stdout.split('\n')[:-1]: + if "Switches=" in output: + clusterName=output.split()[0].split('SwitchName=')[1] + break + elif "SwitchName=inactive-" in output: + continue + else: + clusterName=output.split()[0].split('SwitchName=')[1] + elif len(stdout.split('\n')) == 2: + clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1] + if clusterName.startswith("inactive-"): + return "NOCLUSTERFOUND" + except: + print('No ClusterName could be found for '+node) + print('There seems to be some issues in the slurm topology file') return "NOCLUSTERFOUND" - except: - print('No ClusterName could be found for '+node) - return "NOCLUSTERFOUND" return clusterName def getstatus_slurm(): @@ -246,7 +253,7 @@ def getstatus_slurm(): clustername=getClusterName(node) if clustername is None: continue - instanceType=features[-1] + instanceType=features[0] if queue in current_nodes.keys(): if instanceType in current_nodes[queue].keys(): current_nodes[queue][instanceType]+=1 @@ 
-276,7 +283,9 @@ def getstatus_slurm(): cluster_to_destroy=[] for clustername in nodes_to_destroy_temp.keys(): destroyEntireCluster=True - if clustername in running_cluster or clustername == "NOCLUSTERFOUND": + if clustername == "NOCLUSTERFOUND": + destroyEntireCluster=False + elif clustername in running_cluster: nodes_to_destroy[clustername]=nodes_to_destroy_temp[clustername] destroyEntireCluster=False else: @@ -295,10 +304,10 @@ def getstatus_slurm(): for clusterName in os.listdir(clusters_path): if len(clusterName.split('-')) < 3: continue - instance_keyword='-'.join(clusterName.split('-')[2:]) + hostname_convention='-'.join(clusterName.split('-')[2:]) clusterNumber=int(clusterName.split('-')[1]) queue=clusterName.split('-')[0] - instanceType=getInstanceType(config,queue,instance_keyword) + instanceType=getInstanceType(config,queue,hostname_convention) if not queue in used_index.keys(): used_index[queue]={} if not instanceType in used_index[queue].keys(): @@ -311,19 +320,19 @@ def getstatus_slurm(): nodes = line.split()[0] instance_type = line.split()[1] queue = line.split()[2] - try: - cluster_building.append([int(nodes),instance_type,queue]) - if queue in building_nodes.keys(): - if instance_type in building_nodes[queue].keys(): - building_nodes[queue][instance_type]+=int(nodes) + try: + cluster_building.append([int(nodes),instance_type,queue]) + if queue in building_nodes.keys(): + if instance_type in building_nodes[queue].keys(): + building_nodes[queue][instance_type]+=int(nodes) + else: + building_nodes[queue][instance_type]=int(nodes) else: - building_nodes[queue][instance_type]=int(nodes) - else: - building_nodes[queue]={instance_type:int(nodes)} - except ValueError: - print ('The cluster '+ clusterName + ' does not have a valid entry for \"currently_building\"') - print ('Ignoring') - continue + building_nodes[queue]={instance_type:int(nodes)} + except ValueError: + print ('The cluster '+ clusterName + ' does not have a valid entry for 
\"currently_building\"') + print ('Ignoring') + continue if os.path.isfile(os.path.join(clusters_path,clusterName,'currently_destroying')): cluster_destroying.append(clusterName) return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes @@ -422,7 +431,7 @@ if autoscaling == "true": nextIndex=i used_index[queue][instance_type].append(i) break - clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"] + clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["hostname_convention"] if not queue in current_nodes.keys(): current_nodes[queue]={instance_type:0} else: @@ -448,5 +457,5 @@ if autoscaling == "true": traceback.print_exc() os.remove(lockfile) else: - print("Autoscaling is false") + print("Autoscaling is false (set in /etc/ansible/hosts)") exit() diff --git a/autoscaling/tf_init/cluster-network.tf b/autoscaling/tf_init/cluster-network.tf index d7b0b4f0..c3218b1f 100755 --- a/autoscaling/tf_init/cluster-network.tf +++ b/autoscaling/tf_init/cluster-network.tf @@ -28,6 +28,8 @@ resource "oci_core_cluster_network" "cluster_network" { } freeform_tags = { "user" = var.tags + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name } placement_configuration { availability_domain = var.ad diff --git a/autoscaling/tf_init/compute-cluster.tf b/autoscaling/tf_init/compute-cluster.tf index ef9067b8..1b5b7dfa 100755 --- a/autoscaling/tf_init/compute-cluster.tf +++ b/autoscaling/tf_init/compute-cluster.tf @@ -7,6 +7,7 @@ resource "oci_core_compute_cluster" "compute_cluster" { #Optional display_name = local.cluster_name freeform_tags = { + "user" = var.tags "cluster_name" = local.cluster_name "parent_cluster" = local.cluster_name } diff --git a/autoscaling/tf_init/controller_update.tf b/autoscaling/tf_init/controller_update.tf index ec4ec5ac..1f4d36db 100755 --- a/autoscaling/tf_init/controller_update.tf +++ b/autoscaling/tf_init/controller_update.tf @@ -1,6 +1,6 @@ locals 
{ - controller_path = "${var.autoscaling_folder}/clusters/${var.cluster_name}" + controller_path = "${var.autoscaling_folder}/clusters/${local.cluster_name}" } resource "null_resource" "create_path" { @@ -12,7 +12,7 @@ resource "null_resource" "create_path" { resource "local_file" "hosts" { depends_on = [null_resource.create_path,oci_core_cluster_network.cluster_network] content = join("\n", local.cluster_instances_ips) - filename = "${local.controller_path}/hosts_${var.cluster_name}" + filename = "${local.controller_path}/hosts_${local.cluster_name}" } resource "local_file" "inventory" { @@ -24,6 +24,8 @@ resource "local_file" "inventory" { backup_ip = var.backup_ip, login_name = var.login_name, login_ip = var.login_ip, + monitoring_name = var.monitoring_name, + monitoring_ip = var.monitoring_ip, compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = var.public_subnet, private_subnet = var.private_subnet, @@ -66,7 +68,7 @@ resource "local_file" "inventory" { instance_pool_ocpus=local.instance_pool_ocpus, queue=var.queue, instance_type=var.instance_type, - monitoring=var.monitoring, + cluster_monitoring=var.cluster_monitoring, autoscaling_monitoring = var.autoscaling_monitoring, unsupported = var.unsupported, hyperthreading = var.hyperthreading, @@ -78,7 +80,9 @@ resource "local_file" "inventory" { pam = var.pam, sacct_limits = var.sacct_limits, use_compute_agent=var.use_compute_agent, - healthchecks=var.healthchecks + healthchecks=var.healthchecks, + change_hostname=var.change_hostname, + hostname_convention=var.hostname_convention }) filename = "${local.controller_path}/inventory" } diff --git a/autoscaling/tf_init/instance-pool.tf b/autoscaling/tf_init/instance-pool.tf index 37ca4b2e..a5088bb8 100755 --- a/autoscaling/tf_init/instance-pool.tf +++ b/autoscaling/tf_init/instance-pool.tf @@ -27,6 +27,8 @@ resource "oci_core_instance_pool" "instance_pool" { display_name = local.cluster_name 
freeform_tags = { "user" = var.tags + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name } placement_configurations { availability_domain = var.ad diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 24a2355d..f511e48a 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -4,6 +4,8 @@ ${controller_name} ansible_host=${controller_ip} ansible_user=${controller_usern %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${controller_username} role=controller%{ endif } [login] %{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif } +[monitoring] +%{ if monitoring_name != "" }${monitoring_name} ansible_host=${monitoring_ip} ansible_user=${compute_username} role=monitoring%{ endif } [compute_to_add] [compute_configured] %{ for host, ip in compute ~} @@ -62,7 +64,7 @@ log_vol=${log_vol} ldap=${ldap} queue=${queue} instance_type=${instance_type} -monitoring=${monitoring} +cluster_monitoring=${cluster_monitoring} hyperthreading=${hyperthreading} privilege_sudo=${privilege_sudo} privilege_group_name=${privilege_group_name} @@ -74,4 +76,6 @@ sacct_limits=${sacct_limits} use_compute_agent=${use_compute_agent} zone_name=${zone_name} dns_entries=${dns_entries} -healthchecks=${healthchecks} \ No newline at end of file +healthchecks=${healthchecks} +change_hostname=${change_hostname} +hostname_convention=${hostname_convention} \ No newline at end of file diff --git a/autoscaling/tf_init/locals.tf b/autoscaling/tf_init/locals.tf index 4effdfb6..26f75956 100755 --- a/autoscaling/tf_init/locals.tf +++ b/autoscaling/tf_init/locals.tf @@ -38,6 +38,6 @@ locals { timeout_per_batch= var.cluster_network ? var.use_multiple_ads ? 15 : 30 : var.use_multiple_ads ? 
6 : 15 timeout_ip = join("",[ (( var.node_count - ( var.node_count % 20 ) )/20 + 1 ) * local.timeout_per_batch,"m"]) - platform_type = local.shape == "BM.GPU4.8" ? "AMD_ROME_BM_GPU" : local.shape == "BM.GPU.B4.8" || local.shape == "BM.GPU.H100.8" || local.shape == "BM.GPU.A100-v2.8" ? "AMD_MILAN_BM_GPU" : local.shape == "BM.Standard.E3.128" ? "AMD_ROME_BM" : local.shape == "BM.Standard.E4.128" || local.shape == "BM.DenseIO.E4.128" ? "AMD_MILAN_BM" : "GENERIC_BM" + platform_type = local.shape == "BM.GPU4.8" ? "AMD_ROME_BM_GPU" : local.shape == "BM.GPU.B4.8" || local.shape == "BM.GPU.A100-v2.8" ? "AMD_MILAN_BM_GPU" : local.shape == "BM.Standard.E3.128" ? "AMD_ROME_BM" : local.shape == "BM.Standard.E4.128" || local.shape == "BM.DenseIO.E4.128" ? "AMD_MILAN_BM" : "GENERIC_BM" } diff --git a/autoscaling/tf_init/network.tf b/autoscaling/tf_init/network.tf index 5c6404d6..8a355c18 100755 --- a/autoscaling/tf_init/network.tf +++ b/autoscaling/tf_init/network.tf @@ -183,10 +183,10 @@ resource "oci_dns_rrset" "rrset-cluster-network-SLURM" { for_each = var.slurm && var.dns_entries ? 
toset([for v in range(var.node_count) : tostring(v)]) : [] zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id - domain = "${var.queue}-${var.instance_type}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}" + domain = "${var.hostname_convention}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}" rtype = "A" items { - domain = "${var.queue}-${var.instance_type}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}" + domain = "${var.hostname_convention}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}" rtype = "A" rdata = "${local.cluster_instances_ips[tonumber(each.key)]}" ttl = 3600 diff --git a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index ec66572c..65f73377 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -1,9 +1,9 @@ terraform { - required_version = ">= 1.0" + required_version = ">= 1.2" required_providers { oci = { source = "oracle/oci" - version = "5.37.0" + version = "6.9.0" } } } \ No newline at end of file diff --git a/bin/create_cluster.sh b/bin/create_cluster.sh index df2bb3d5..4bfc9a5b 100755 --- a/bin/create_cluster.sh +++ b/bin/create_cluster.sh @@ -51,6 +51,8 @@ hyperthreading=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types. 
region=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.region " $queues_conf` private_subnet=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.private_subnet " $queues_conf` private_subnet_id=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.private_subnet_id " $queues_conf` +change_hostname=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.change_hostname " $queues_conf` +hostname_convention=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.hostname_convention " $queues_conf` @@ -67,7 +69,7 @@ do echo $1 $3 $4 >> currently_building echo $3 $4 > cluster_options - sed "s~##NODES##~$1~g;s~##NAME##~$2~g;s~##SHAPE##~$shape~g;s~##CN##~$cluster_network~g;s~##QUEUE##~${4}~g;s~##COMP##~${targetCompartment}~g;s~##AD##~${ADName}~g;s~##BOOT##~${boot_volume_size}~g;s~##USEMP##~${use_marketplace_image}~g;s~##IMAGE##~${image}~g;s~##OCPU##~${instance_pool_ocpus}~g;s~##MEM##~${instance_pool_memory}~g;s~##CUSTOM_MEM##~${instance_pool_custom_memory}~g;s~##MP_LIST##~${marketplace_listing}~g;s~##HT##~${hyperthreading}~g;s~##INST_TYPE##~$3~g;s~##TAGS##~$tags~g;s~##REGION##~${region}~g;s~##PRIVATE_SUBNET_ID##~${private_subnet_id}~g;s~##PRIVATE_SUBNET##~${private_subnet}~g;s~##CC##~$compute_cluster~g" $conf_folder/variables.tf > variables.tf + sed 
"s~##NODES##~$1~g;s~##NAME##~$2~g;s~##SHAPE##~$shape~g;s~##CN##~$cluster_network~g;s~##QUEUE##~${4}~g;s~##COMP##~${targetCompartment}~g;s~##AD##~${ADName}~g;s~##BOOT##~${boot_volume_size}~g;s~##USEMP##~${use_marketplace_image}~g;s~##IMAGE##~${image}~g;s~##OCPU##~${instance_pool_ocpus}~g;s~##MEM##~${instance_pool_memory}~g;s~##CUSTOM_MEM##~${instance_pool_custom_memory}~g;s~##MP_LIST##~${marketplace_listing}~g;s~##HT##~${hyperthreading}~g;s~##INST_TYPE##~$3~g;s~##TAGS##~$tags~g;s~##REGION##~${region}~g;s~##PRIVATE_SUBNET_ID##~${private_subnet_id}~g;s~##CH_HOST##~${change_hostname}~g;s~##HOST_CONV##~${hostname_convention}~g;s~##PRIVATE_SUBNET##~${private_subnet}~g;s~##CC##~$compute_cluster~g" $conf_folder/variables.tf > variables.tf echo "Started to build $2" start=`date -u +%s` diff --git a/bin/resize.py b/bin/resize.py index d01ceea3..7a416c47 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -31,13 +31,13 @@ def wait_for_running_status(cluster_name,comp_ocid,cn_ocid,CN,expected_size=None instances=computeManagementClient.list_instance_pool_instances(comp_ocid,cn_ocid).data if state != 'RUNNING': print("Cluster state is "+state+", cannot add or remove nodes") - print ("Waiting...") + print("Waiting...") time.sleep(30) elif not expected_size is None: if expected_size == len(instances): break else: - print("The instance list does not match the expected size") + print("STDOUT: The instance list does not match the expected size") time.sleep(30) else: break @@ -57,7 +57,7 @@ def get_instances(comp_ocid,cn_ocid,CN): vnic = virtualNetworkClient.get_vnic(vnic_attachment.vnic_id).data except: continue - cn_instances.append({'display_name':instance.display_name,'ip':vnic.private_ip,'ocid':instance.id}) + cn_instances.append({'display_name':instance.display_name,'ip':vnic.private_ip,'ocid':instance.id}) else: if CN == "CN": instance_summaries = oci.pagination.list_call_get_all_results(computeManagementClient.list_cluster_network_instances,comp_ocid,cn_ocid).data @@ -72,7 
+72,7 @@ def get_instances(comp_ocid,cn_ocid,CN): vnic = virtualNetworkClient.get_vnic(vnic_attachment.vnic_id).data except: continue - cn_instances.append({'display_name':instance_summary.display_name,'ip':vnic.private_ip,'ocid':instance_summary.id}) + cn_instances.append({'display_name':instance_summary.display_name,'ip':vnic.private_ip,'ocid':instance_summary.id}) return cn_instances def parse_inventory(inventory): @@ -138,10 +138,10 @@ def backup_inventory(inventory): print("File "+tmp_file_do_not_edit+" exist, it means previous reconfigure had failed. Hence updating inventory to previous state") shutil.move(tmp_file_do_not_edit,inventory) -def destroy_unreachable_reconfigure(inventory,nodes_to_remove,playbook): +def destroy_unreachable_reconfigure(inventory,nodes_to_remove,playbook): if not os.path.isfile("/etc/ansible/hosts"): - print("There is no inventory file, are you on the controller? The cluster has not been resized") - exit() + print("STDOUT: There is no inventory file, are you on the controller? 
The cluster has not been resized") + exit(1) backup_inventory(inventory) inventory_dict = parse_inventory(inventory) tmp_inventory_destroy="/tmp/"+inventory.replace('/','_')+"_destroy" @@ -167,11 +167,12 @@ def destroy_unreachable_reconfigure(inventory,nodes_to_remove,playbook): if instance['display_name'] in nodes_to_remove and not instance['ip'] in ips_to_remove: ips_to_remove.append(instance['ip']) if len(ips_to_remove) != len(nodes_to_remove): - print("Some nodes are removed in OCI and removed from the inventory") - print("Try rerunning with the --nodes option and a list of IPs or Slurm Hostnames to cleanup the controller") + print("STDOUT: Some nodes are removed in OCI and removed from the inventory") + print("STDOUT: Try rerunning with the --nodes option and a list of IPs or Slurm Hostnames to cleanup the controller") write_inventory(inventory_dict,tmp_inventory_destroy) - if not len(ips_to_remove): - print("No hostname found, trying anyway with "+" ".join(nodes_to_remove)) + if not len(ips_to_remove) or not slurm_name_change: + if not len(ips_to_remove): + print("STDOUT: No hostname found, trying anyway with "+" ".join(nodes_to_remove)) for node in nodes_to_remove: # Temporary fix while the playbook is changed to be able to run multiple at the time update_flag = update_cluster(tmp_inventory_destroy,playbook,add_vars={"unreachable_node_list":node}) time.sleep(10) @@ -190,8 +191,8 @@ def destroy_unreachable_reconfigure(inventory,nodes_to_remove,playbook): def destroy_reconfigure(inventory,nodes_to_remove,playbook): if not os.path.isfile("/etc/ansible/hosts"): - print("There is no inventory file, are you on the controller? The cluster has not been resized") - exit() + print("STDOUT: There is no inventory file, are you on the controller? 
The cluster has not been resized") + exit(1) backup_inventory(inventory) inventory_dict = parse_inventory(inventory) inventory_dict['compute_to_destroy']=[] @@ -262,8 +263,8 @@ def add_reconfigure(comp_ocid,cn_ocid,inventory,CN,specific_hosts=None): reachable_instances=instances unreachable_instances=[] if not os.path.isfile(inventory): - print("There is no inventory file, are you on the controller? The cluster has been resized but not reconfigured") - exit() + print("STDOUT: There is no inventory file, are you on the controller? The cluster has been resized but not reconfigured") + exit(1) host_to_wait_for=[] for node in reachable_instances: name=node['display_name'] @@ -303,14 +304,15 @@ def add_reconfigure(comp_ocid,cn_ocid,inventory,CN,specific_hosts=None): write_inventory(inventory_dict,tmp_inventory) os.system('sudo mv '+tmp_inventory+' '+inventory) else: - print("The reconfiguration to add the node(s) had an error") - print("Try rerunning this command: ansible-playbook -i "+tmp_inventory_add+' '+playbooks_dir+"resize_add.yml" ) + print("STDOUT: The reconfiguration to add the node(s) had an error") + print("STDOUT: Try rerunning this command: ansible-playbook -i "+tmp_inventory_add+' '+playbooks_dir+"resize_add.yml" ) + exit(1) def reconfigure(comp_ocid,cn_ocid,inventory,CN, crucial=False): instances = get_instances(comp_ocid,cn_ocid,CN) if not os.path.isfile(inventory): - print("There is no inventory file, are you on the controller? Reconfigure did not happen") - exit() + print("STDOUT: There is no inventory file, are you on the controller? 
Reconfigure did not happen") + exit(1) backup_inventory(inventory) inventory_dict = parse_inventory(inventory) host_to_wait_for=[] @@ -465,7 +467,7 @@ def get_summary(comp_ocid,cluster_name): elif cn_summary_tmp.lifecycle_state == "SCALING": scaling_clusters = scaling_clusters + 1 if running_clusters == 0: - try: + try: cn_summaries = computeClient.list_compute_clusters(comp_ocid,display_name=cluster_name).data.items except: print("The list_compute_clusters call returned an error, considering no Compute CLusters are present") @@ -475,7 +477,7 @@ def get_summary(comp_ocid,cluster_name): for cn_summary_tmp in cn_summaries: if cn_summary_tmp.lifecycle_state == "ACTIVE" and cn_summary_tmp.display_name == cluster_name : cn_summary = cn_summary_tmp - running_clusters = running_clusters + 1 + running_clusters = running_clusters + 1 if running_clusters == 0: cn_summaries = computeManagementClient.list_instance_pools(comp_ocid,display_name=cluster_name).data if len(cn_summaries) > 0: @@ -483,7 +485,7 @@ def get_summary(comp_ocid,cluster_name): for cn_summary_tmp in cn_summaries: if cn_summary_tmp.lifecycle_state == "RUNNING": cn_summary = cn_summary_tmp - running_clusters = running_clusters + 1 + running_clusters = running_clusters + 1 elif cn_summary_tmp.lifecycle_state == "SCALING": scaling_clusters = scaling_clusters + 1 if running_clusters == 0: @@ -554,7 +556,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index create_vnic_details=oci.core.models.CreateVnicDetails(assign_public_ip=False,subnet_id=vnic_attachment.subnet_id) shape_config=instance.shape_config - try: + try: nvmes=shape_config.local_disks launchInstanceShapeConfigDetails = oci.core.models.LaunchInstanceShapeConfigDetails(baseline_ocpu_utilization=shape_config.baseline_ocpu_utilization,memory_in_gbs=shape_config.memory_in_gbs,nvmes=nvmes,ocpus=shape_config.ocpus) except: @@ -563,7 +565,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index 
splitted_name[-1]=str(max_previous_index+1+index) new_display_name = '-'.join(splitted_name) launch_instance_details=oci.core.models.LaunchInstanceDetails(agent_config=agent_config,availability_domain=instance.availability_domain, compartment_id=comp_ocid,compute_cluster_id=cn_ocid,shape=instance.shape,shape_config=launchInstanceShapeConfigDetails,source_details=instance.source_details,metadata=instance.metadata,display_name=new_display_name,freeform_tags=instance.freeform_tags,create_vnic_details=create_vnic_details) - return launch_instance_details + return launch_instance_details batchsize=12 inventory="/etc/ansible/hosts" @@ -620,10 +622,10 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index if inv_vars.startswith("dns_entries"): dns_entries=bool(inv_vars.split("dns_entries=")[1].strip()) break -queue=None +hostname_convention=None for inv_vars in inventory_dict["all:vars"]: - if inv_vars.startswith("queue"): - queue=inv_vars.split("queue=")[1].strip() + if inv_vars.startswith("hostname_convention"): + hostname_convention=inv_vars.split("hostname_convention=")[1].strip() break instance_type="" for inv_vars in inventory_dict["all:vars"]: @@ -635,6 +637,11 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index if inv_vars.startswith("private_subnet"): private_subnet_cidr=ipaddress.ip_network(inv_vars.split("private_subnet=")[1].strip()) break +slurm_name_change=None +for inv_vars in inventory_dict["all:vars"]: + if inv_vars.startswith("change_hostname"): + slurm_name_change=bool(inv_vars.split("change_hostname=")[1].strip()) + break hostnames=args.nodes if hostnames is None: @@ -642,11 +649,11 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index if args.mode=='remove' and args.number is None and args.nodes is None: print("STDOUT: No Nodes to remove") - exit() + exit(1) if args.mode=='add' and args.number is None: print("STDOUT: No Nodes to add") - exit() + exit(1) if 
args.no_reconfigure is None: no_reconfigure=False @@ -692,7 +699,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index cn_summary,ip_summary,CN = get_summary(comp_ocid,cluster_name) if cn_summary is None: - exit() + exit(1) cn_ocid =cn_summary.id if CN != "CC": @@ -719,7 +726,8 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index cn_instances = get_instances(comp_ocid,cn_ocid,CN) inventory_instances =[] only_inventory_instance=[] - zone_id=dns_client.list_zones(compartment_id=comp_ocid,name=zone_name,zone_type="PRIMARY",scope="PRIVATE").data[0].id + if dns_entries: + zone_id=dns_client.list_zones(compartment_id=comp_ocid,name=zone_name,zone_type="PRIMARY",scope="PRIVATE").data[0].id for line in inventory_dict['compute_configured']: host=line.split('ansible_host=')[0].strip() ip=line.split("ansible_host=")[1].split("ansible_user=")[0].strip() @@ -736,7 +744,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index print("STDOUT: "+host+" with IP: "+ip+" is in the inventory but not in the cluster") only_inventory_instance.append({'display_name':host,'ip':ip,'ocid':None}) if args.mode == 'remove_unreachable': - if len(hostnames) == 0: + if len(hostnames) == 0: reachable_instances,unreachable_instances=getreachable(cn_instances+only_inventory_instance,username,delay=10) if len(unreachable_instances): hostnames_to_remove=[i['display_name'] for i in unreachable_instances] @@ -775,6 +783,10 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index else: hostnames_to_remove=[cn_instances[i]['display_name'] for i in range(len(cn_instances))] else: + # first check if all hostnames passed via --nodes exist. Exit if that's not the case + for host in hostnames: + if host not in [i["display_name"] for i in cn_instances]: + print(f"ERROR: node {host} does not appear to exist in this cluster. 
Please check your arguments and rerun") hostnames_to_remove2 = list(hostnames) hostnames_to_remove2.extend(x for x in hostnames_to_remove if x not in hostnames_to_remove2) hostnames_to_remove=hostnames_to_remove2 @@ -791,30 +803,30 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index print("STDOUT: Force deleting the nodes") terminated_instances=0 cn_summary,ip_summary,CN = get_summary(comp_ocid,cluster_name) - if CN != "CC": + if CN != "CC": current_size = ip_summary.size for instanceName in hostnames_to_remove: try: instance_id = computeClient.list_instances(comp_ocid,display_name=instanceName).data[0].id if CN == "CC": ComputeClientCompositeOperations.terminate_instance_and_wait_for_state(instance_id,wait_for_states=["TERMINATING","TERMINATED"]) - else: + else: instance_details = oci.core.models.DetachInstancePoolInstanceDetails(instance_id=instance_id,is_auto_terminate=True,is_decrement_size=True) ComputeManagementClientCompositeOperations.detach_instance_pool_instance_and_wait_for_work_request(ipa_ocid,instance_details) if dns_entries: get_rr_set_response = dns_client.delete_rr_set(zone_name_or_id=zone_id,domain=instanceName+"."+zone_name,rtype="A",scope="PRIVATE") ip=None - for i in cn_instances: + for i in cn_instances: if i['display_name'] == instanceName: ip = ipaddress.ip_address(i['ip']) if not ip is None: index = list(private_subnet_cidr.hosts()).index(ip)+2 - slurm_name=queue+"-"+instance_type+"-"+str(index)+"."+zone_name + slurm_name=hostname_convention+"-"+str(index)+"."+zone_name get_rr_set_response = dns_client.delete_rr_set(zone_name_or_id=zone_id,domain=slurm_name,rtype="A",scope="PRIVATE") terminated_instances = terminated_instances + 1 - print("STDOUT: The instance "+instanceName+" is terminating") + print("STDOUT: The instance "+instanceName+" is terminating") except: - print("The instance "+instanceName+" does not exist") + print("STDOUT: The instance "+instanceName+" does not exist") cn_summary,ip_summary,CN = 
get_summary(comp_ocid,cluster_name) if CN == "CC": instance_id = computeClient.list_instances(comp_ocid,display_name=hostnames_to_remove[-1]).data[0].id @@ -823,7 +835,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index instance_state = computeClient.get_instance(instance_id).data.lifecycle_state if instance_state == "TERMINATED": break - else: + else: time.sleep(10) except: break @@ -842,7 +854,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index if CN == "CC": current_size=len(cn_instances) if len(cn_instances) == 0: - print("The resize script cannot work for a compute cluster if the size is there is no node in the cluster") + print("STDOUT: The resize script cannot work for a compute cluster if the size is there is no node in the cluster") else: for cn_instance in cn_instances: max_index=-1 @@ -870,12 +882,12 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index instanceName=new_instance['display_name'] ip = ipaddress.ip_address(new_instance['ip']) index = list(private_subnet_cidr.hosts()).index(ip)+2 - slurm_name=queue+"-"+instance_type+"-"+str(index)+"."+zone_name + slurm_name=hostname_convention+"-"+str(index)+"."+zone_name get_rr_set_response = dns_client.update_rr_set(zone_name_or_id=zone_id,domain=slurm_name,rtype="A",scope="PRIVATE",update_rr_set_details=oci.dns.models.UpdateRRSetDetails(items=[oci.dns.models.RecordDetails(domain=slurm_name,rdata=new_instance['ip'],rtype="A",ttl=3600,)])) get_rr_set_response = dns_client.update_rr_set(zone_name_or_id=zone_id,domain=instanceName+"."+zone_name,rtype="A",scope="PRIVATE",update_rr_set_details=oci.dns.models.UpdateRRSetDetails(items=[oci.dns.models.RecordDetails(domain=instanceName+"."+zone_name,rdata=new_instance['ip'],rtype="A",ttl=3600)])) updateTFState(inventory,cluster_name,newsize) if newsize == current_size: - print("No node was added, please check the work requests of the Cluster Network and Instance Pool 
to see why") + print("STDOUT: No node was added, please check the work requests of the Cluster Network and Instance Pool to see why") exit(1) if not no_reconfigure: add_reconfigure(comp_ocid,cn_ocid,inventory,CN) \ No newline at end of file diff --git a/bin/resize.sh b/bin/resize.sh index cbcb4232..8cd856d5 100755 --- a/bin/resize.sh +++ b/bin/resize.sh @@ -21,9 +21,9 @@ then exit fi -if [ $# -eq 0 ] +if [ $# -eq 0 ] || [ $1 == "--help" ] then - python3 $folder/resize.py --help + /usr/bin/python3 $folder/resize.py --help exit fi @@ -104,7 +104,7 @@ then mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE cluster_log.clusters SET started_resize='$start_timestamp',state='resizing' WHERE id='$cluster_id'" >> $log 2>&1 fi - python3 $folder/resize.py ${@} | tee -a $log 2>&1 | grep STDOUT + /usr/bin/python3 $folder/resize.py ${@} | tee -a $log 2>&1 | grep STDOUT status=${PIPESTATUS[0]} end=`date -u +%s` end_timestamp=`date -u +'%F %T'` @@ -115,7 +115,7 @@ then echo "Successfully Resized cluster $cluster_name in $runtime seconds" if [ -f $monitoring_folder/activated ] then - nodes_list=`python3 $folder/resize.py --cluster_name $cluster_name list | grep ocid1.instance` + nodes_list=`/usr/bin/python3 $folder/resize.py --cluster_name $cluster_name list | grep ocid1.instance` length=`echo $nodes_list | wc -w` newSize=$((length/3)) @@ -170,5 +170,5 @@ then rm currently_resizing fi else - python3 $folder/resize.py ${@} & + /usr/bin/python3 $folder/resize.py ${@} fi diff --git a/bin/slurm_config.sh b/bin/slurm_config.sh index 99aae8d0..8bc157d1 100644 --- a/bin/slurm_config.sh +++ b/bin/slurm_config.sh @@ -13,26 +13,22 @@ playbooks_path=$folder/../playbooks/ source /etc/os-release -if [[ `cat $conf_folder/queues.conf | grep instance_keyword | uniq -c -d | wc -l ` == 0 ]]; + +if [[ ${@: -1} == "--INITIAL" || ${@: -1} == "--initial" || ${@: -1} == "-INITIAL" || ${@: -1} == "-initial" ]] then - if [[ ${@: -1} == "--INITIAL" || ${@: -1} == "--initial" 
|| ${@: -1} == "-INITIAL" || ${@: -1} == "-initial" ]] - then - sudo rm /etc/slurm/topology.conf - sudo /usr/sbin/slurmctld -c - fi - ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook $playbooks_path/slurm_config.yml - if [[ ${@: -1} == "--INITIAL" || ${@: -1} == "--initial" || ${@: -1} == "-INITIAL" || ${@: -1} == "-initial" ]] - then - for inventory in /opt/oci-hpc/autoscaling/clusters/*/inventory ; - do - if [ -f $(dirname $inventory)/currently* ] - then - echo "Cluster is not in running state" - else - ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook $playbooks_path/slurm_config_as.yml -i $inventory - fi - done - fi - else - echo "There are some duplicates instance_keyword lines, please make them unique" + sudo rm /etc/slurm/topology.conf + sudo /usr/sbin/slurmctld -c fi +ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook $playbooks_path/slurm_config.yml +if [[ ${@: -1} == "--INITIAL" || ${@: -1} == "--initial" || ${@: -1} == "-INITIAL" || ${@: -1} == "-initial" ]] +then + for inventory in /opt/oci-hpc/autoscaling/clusters/*/inventory ; + do + if [ -f $(dirname $inventory)/currently* ] + then + echo "Cluster is not in running state" + else + ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook $playbooks_path/slurm_config_as.yml -i $inventory + fi + done +fi \ No newline at end of file diff --git a/cluster-network-configuration.tf b/cluster-network-configuration.tf index f2772b2a..f00e2338 100755 --- a/cluster-network-configuration.tf +++ b/cluster-network-configuration.tf @@ -1,5 +1,5 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configuration" { - count = ( ! var.compute_cluster ) && var.cluster_network ? 1 : 0 + count = (!var.compute_cluster) && var.cluster_network ? 
1 : 0 depends_on = [oci_core_app_catalog_subscription.mp_image_subscription] compartment_id = var.targetCompartment display_name = local.cluster_name @@ -13,11 +13,11 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati } display_name = local.cluster_name freeform_tags = { - "cluster_name" = local.cluster_name + "cluster_name" = local.cluster_name "parent_cluster" = local.cluster_name } metadata = { -# TODO: add user key to the authorized_keys + # TODO: add user key to the authorized_keys ssh_authorized_keys = "${var.ssh_key}\n${tls_private_key.ssh.public_key_openssh}" user_data = base64encode(data.template_file.config.rendered) } @@ -30,27 +30,27 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati plugins_config { desired_state = "DISABLED" name = "OS Management Service Agent" - } - dynamic plugins_config { - + } + dynamic "plugins_config" { + for_each = var.use_compute_agent ? ["ENABLED"] : ["DISABLED"] content { - name = "Compute HPC RDMA Authentication" - desired_state = plugins_config.value - } - } - dynamic plugins_config { + name = "Compute HPC RDMA Authentication" + desired_state = plugins_config.value + } + } + dynamic "plugins_config" { for_each = var.use_compute_agent ? ["ENABLED"] : ["DISABLED"] content { - name = "Compute HPC RDMA Auto-Configuration" - desired_state = plugins_config.value + name = "Compute HPC RDMA Auto-Configuration" + desired_state = plugins_config.value } } - dynamic plugins_config { + dynamic "plugins_config" { for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"] content { - name = "Compute RDMA GPU Monitoring" - desired_state = plugins_config.value + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value } } } @@ -58,21 +58,22 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati dynamic "platform_config" { for_each = var.BIOS ? 
range(1) : [] content { - type = local.platform_type - are_virtual_instructions_enabled = var.virt_instr - is_access_control_service_enabled = var.access_ctrl + type = local.platform_type + are_virtual_instructions_enabled = var.virt_instr + is_access_control_service_enabled = var.access_ctrl is_input_output_memory_management_unit_enabled = var.IOMMU - is_symmetric_multi_threading_enabled = var.SMT - numa_nodes_per_socket = var.numa_nodes_per_socket == "Default" ? (local.platform_type == "GENERIC_BM" ? "NPS1": "NPS4" ): var.numa_nodes_per_socket - percentage_of_cores_enabled = var.percentage_of_cores_enabled == "Default" ? 100 : tonumber(var.percentage_of_cores_enabled) + is_symmetric_multi_threading_enabled = var.SMT + numa_nodes_per_socket = var.numa_nodes_per_socket == "Default" ? (local.platform_type == "GENERIC_BM" ? "NPS1" : "NPS4") : var.numa_nodes_per_socket + percentage_of_cores_enabled = var.percentage_of_cores_enabled == "Default" ? 100 : tonumber(var.percentage_of_cores_enabled) } } - + shape = var.cluster_network_shape source_details { source_type = "image" boot_volume_size_in_gbs = var.boot_volume_size + boot_volume_vpus_per_gb = 30 image_id = local.cluster_network_image } } diff --git a/cluster-network.tf b/cluster-network.tf index acc39040..8c87e052 100755 --- a/cluster-network.tf +++ b/cluster-network.tf @@ -1,24 +1,24 @@ -resource "oci_core_volume" "nfs-cluster-network-volume" { - count = ( ! var.compute_cluster ) && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 +resource "oci_core_volume" "nfs-cluster-network-volume" { + count = (!var.compute_cluster) && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 
1 : 0 availability_domain = var.ad - compartment_id = var.targetCompartment - display_name = "${local.cluster_name}-nfs-volume" - + compartment_id = var.targetCompartment + display_name = "${local.cluster_name}-nfs-volume" + size_in_gbs = var.cluster_block_volume_size vpus_per_gb = split(".", var.cluster_block_volume_performance)[0] } -resource "oci_core_volume_attachment" "cluster_network_volume_attachment" { - count = ( ! var.compute_cluster ) && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 +resource "oci_core_volume_attachment" "cluster_network_volume_attachment" { + count = (!var.compute_cluster) && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 attachment_type = "iscsi" volume_id = oci_core_volume.nfs-cluster-network-volume[0].id instance_id = local.cluster_instances_ids[0] display_name = "${local.cluster_name}-cluster-network-volume-attachment" device = "/dev/oracleoci/oraclevdb" -} +} resource "oci_core_cluster_network" "cluster_network" { - count = ( ! var.compute_cluster ) && var.cluster_network && var.node_count > 0 ? 1 : 0 + count = (!var.compute_cluster) && var.cluster_network && var.node_count > 0 ? 
1 : 0 depends_on = [oci_core_app_catalog_subscription.mp_image_subscription, oci_core_subnet.private-subnet, oci_core_subnet.public-subnet, oci_core_instance.controller] compartment_id = var.targetCompartment instance_pools { @@ -27,8 +27,8 @@ resource "oci_core_cluster_network" "cluster_network" { display_name = local.cluster_name } freeform_tags = { - "cluster_name" = local.cluster_name - "parent_cluster" = local.cluster_name + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name } placement_configuration { availability_domain = var.ad diff --git a/compute-cluster.tf b/compute-cluster.tf index ef9067b8..7ea5e9be 100755 --- a/compute-cluster.tf +++ b/compute-cluster.tf @@ -1,13 +1,13 @@ resource "oci_core_compute_cluster" "compute_cluster" { count = var.compute_cluster && var.cluster_network && var.node_count > 0 ? 1 : 0 - #Required - availability_domain = var.ad - compartment_id = var.targetCompartment + #Required + availability_domain = var.ad + compartment_id = var.targetCompartment - #Optional - display_name = local.cluster_name - freeform_tags = { - "cluster_name" = local.cluster_name - "parent_cluster" = local.cluster_name + #Optional + display_name = local.cluster_name + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name } } \ No newline at end of file diff --git a/compute-nodes.tf b/compute-nodes.tf index 4149e958..2a39192f 100755 --- a/compute-nodes.tf +++ b/compute-nodes.tf @@ -1,24 +1,24 @@ -resource "oci_core_volume" "nfs-compute-cluster-volume" { - count = var.compute_cluster && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 +resource "oci_core_volume" "nfs-compute-cluster-volume" { + count = var.compute_cluster && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 
1 : 0 availability_domain = var.ad - compartment_id = var.targetCompartment - display_name = "${local.cluster_name}-nfs-volume" - + compartment_id = var.targetCompartment + display_name = "${local.cluster_name}-nfs-volume" + size_in_gbs = var.cluster_block_volume_size vpus_per_gb = split(".", var.cluster_block_volume_performance)[0] } -resource "oci_core_volume_attachment" "compute_cluster_volume_attachment" { - count = var.compute_cluster && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 +resource "oci_core_volume_attachment" "compute_cluster_volume_attachment" { + count = var.compute_cluster && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 attachment_type = "iscsi" volume_id = oci_core_volume.nfs-compute-cluster-volume[0].id instance_id = oci_core_instance.compute_cluster_instances[0].id display_name = "${local.cluster_name}-compute-cluster-volume-attachment" device = "/dev/oracleoci/oraclevdb" -} +} resource "oci_core_instance" "compute_cluster_instances" { - count = var.compute_cluster ? var.node_count : 0 + count = var.compute_cluster ? var.node_count : 0 depends_on = [oci_core_compute_cluster.compute_cluster] availability_domain = var.ad compartment_id = var.targetCompartment @@ -26,42 +26,42 @@ resource "oci_core_instance" "compute_cluster_instances" { agent_config { - are_all_plugins_disabled = false - is_management_disabled = true - is_monitoring_disabled = false + are_all_plugins_disabled = false + is_management_disabled = true + is_monitoring_disabled = false + + plugins_config { + desired_state = "DISABLED" + name = "OS Management Service Agent" + } + dynamic "plugins_config" { - plugins_config { - desired_state = "DISABLED" - name = "OS Management Service Agent" - } - dynamic plugins_config { - - for_each = var.use_compute_agent ? 
["ENABLED"] : ["DISABLED"] - content { - name = "Compute HPC RDMA Authentication" - desired_state = plugins_config.value - } - } - dynamic plugins_config { - for_each = var.use_compute_agent ? ["ENABLED"] : ["DISABLED"] - content { - name = "Compute HPC RDMA Auto-Configuration" - desired_state = plugins_config.value - } - } - dynamic plugins_config { - for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"] - content { - name = "Compute RDMA GPU Monitoring" - desired_state = plugins_config.value - } - } + for_each = var.use_compute_agent ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute HPC RDMA Authentication" + desired_state = plugins_config.value } + } + dynamic "plugins_config" { + for_each = var.use_compute_agent ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute HPC RDMA Auto-Configuration" + desired_state = plugins_config.value + } + } + dynamic "plugins_config" { + for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value + } + } + } - display_name = "${local.cluster_name}-node-${var.compute_cluster_start_index+count.index}" + display_name = "${local.cluster_name}-node-${var.compute_cluster_start_index + count.index}" freeform_tags = { - "cluster_name" = local.cluster_name + "cluster_name" = local.cluster_name "parent_cluster" = local.cluster_name } @@ -70,13 +70,14 @@ resource "oci_core_instance" "compute_cluster_instances" { user_data = base64encode(data.template_file.controller_config.rendered) } source_details { - source_id = local.cluster_network_image + source_id = local.cluster_network_image source_type = "image" boot_volume_size_in_gbs = var.boot_volume_size + boot_volume_vpus_per_gb = 30 } - compute_cluster_id=length(var.compute_cluster_id) > 2 ? 
var.compute_cluster_id : oci_core_compute_cluster.compute_cluster[0].id + compute_cluster_id = length(var.compute_cluster_id) > 2 ? var.compute_cluster_id : oci_core_compute_cluster.compute_cluster[0].id create_vnic_details { - subnet_id = local.subnet_id + subnet_id = local.subnet_id assign_public_ip = false } } \ No newline at end of file diff --git a/conf/queues.conf.example b/conf/queues.conf.example index fe187437..642a4900 100644 --- a/conf/queues.conf.example +++ b/conf/queues.conf.example @@ -5,8 +5,9 @@ instance_types: - name: HPC default: true - shape: BM.HPC2.36 - instance_keyword: hpc + shape: BM.GPU.H100.8 + change_hostname: false + hostname_convention: HPC # Will add -INDEX with the index of the IP in the subnet permanent: False cluster_network: true compute_cluster: true @@ -28,8 +29,9 @@ hyperthreading: true - name: permanent default: true - shape: BM.HPC2.36 - instance_keyword: permanent + shape: BM.GPU.H100.8 + change_hostname: false + hostname_convention: HPC # Will add -INDEX with the index of the IP in the subnet permanent: true cluster_network: true compute_cluster: true @@ -54,8 +56,9 @@ instance_types: - name: p100 default: true - shape: VM.GPU2.1 - instance_keyword: gpu + shape: BM.GPU.H100.8 + change_hostname: false + hostname_convention: p100 # Will add -INDEX with the index of the IP in the subnet permanent: False cluster_network: false max_number_nodes: 4 @@ -76,8 +79,9 @@ hyperthreading: true - name: amdflex default: false - shape: VM.Standard.E3.Flex - instance_keyword: amd + shape: VM.Standard.E5.Flex + change_hostname: false + hostname_convention: amdflex # Will add -INDEX with the index of the IP in the subnet permanent: false cluster_network: false max_number_nodes: 100 diff --git a/conf/variables.tpl b/conf/variables.tpl index d8b0047f..c4150164 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -53,12 +53,10 @@ variable "marketplace_listing" { variable "marketplace_version_id" { type = map(string) default = { - 
"HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" - "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" - "GPU_OL7_CUDA12.2" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0" - "GPU_OL8_CUDA12.2" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0" - "GPU_OL7_CUDA12.4" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.4-2024.03.15-0" - "GPU_OL8_CUDA12.4" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.4-2024.03.15-0" + "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.09.18-0" + "GPU_OL8_NV560" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-560-CUDA-12.6-2024.09.18-0" + "GPU_OL8_NV550" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-550-CUDA-12.4-2024.09.18-0" + "GPU_OL8_NV535" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.09.18-0" } } @@ -92,6 +90,8 @@ variable "backup_name" {default = "${backup_name}"} variable "backup_ip" {default = "${backup_ip}"} variable "login_name" {default = "${login_name}"} variable "login_ip" {default = "${login_ip}"} +variable "monitoring_name" {default = "${monitoring_name}"} +variable "monitoring_ip" {default = "${monitoring_ip}"} variable "scripts_folder" {default = "/opt/oci-hpc/bin/"} variable "autoscaling_folder" {default = "/opt/oci-hpc/autoscaling/"} variable "cluster_block_volume_size" {default="${cluster_block_volume_size}"} @@ -120,7 +120,7 @@ variable "hyperthreading" { default = ##HT## } variable "unsupported" { default = ${unsupported} } variable "image_ocid" { default = "##IMAGE##" } variable "ldap" { default = ${ldap} } -variable "monitoring" { default = ${monitoring} } +variable "cluster_monitoring" { default = ${cluster_monitoring} } variable "autoscaling_monitoring" { default = ${autoscaling_monitoring} } @@ -156,4 +156,10 @@ variable "numa_nodes_per_socket" { } variable "percentage_of_cores_enabled" { default = "${percentage_of_cores_enabled}" +} +variable "change_hostname" 
{ + default = ##CH_HOST## +} +variable "hostname_convention" { + default = "##HOST_CONV##" } \ No newline at end of file diff --git a/controller.tf b/controller.tf index ad45196e..a2573634 100644 --- a/controller.tf +++ b/controller.tf @@ -1,44 +1,44 @@ -resource "oci_core_volume" "controller_volume" { - count = var.controller_block ? 1 : 0 +resource "oci_core_volume" "controller_volume" { + count = var.controller_block ? 1 : 0 availability_domain = var.controller_ad - compartment_id = var.targetCompartment - display_name = "${local.cluster_name}-controller-volume" - + compartment_id = var.targetCompartment + display_name = "${local.cluster_name}-controller-volume" + size_in_gbs = var.controller_block_volume_size vpus_per_gb = split(".", var.controller_block_volume_performance)[0] -} +} -resource "oci_core_volume_attachment" "controller_volume_attachment" { - count = var.controller_block ? 1 : 0 +resource "oci_core_volume_attachment" "controller_volume_attachment" { + count = var.controller_block ? 1 : 0 attachment_type = "iscsi" volume_id = oci_core_volume.controller_volume[0].id instance_id = oci_core_instance.controller.id display_name = "${local.cluster_name}-controller-volume-attachment" device = "/dev/oracleoci/oraclevdb" is_shareable = true -} +} resource "oci_core_volume_backup_policy" "controller_boot_volume_backup_policy" { - count = var.controller_boot_volume_backup ? 1 : 0 - compartment_id = var.targetCompartment - display_name = "${local.cluster_name}-controller_boot_volume_daily" - schedules { - backup_type = var.controller_boot_volume_backup_type - period = var.controller_boot_volume_backup_period - retention_seconds = var.controller_boot_volume_backup_retention_seconds - time_zone = var.controller_boot_volume_backup_time_zone - } + count = var.controller_boot_volume_backup ? 
1 : 0 + compartment_id = var.targetCompartment + display_name = "${local.cluster_name}-controller_boot_volume_daily" + schedules { + backup_type = var.controller_boot_volume_backup_type + period = var.controller_boot_volume_backup_period + retention_seconds = var.controller_boot_volume_backup_retention_seconds + time_zone = var.controller_boot_volume_backup_time_zone + } } resource "oci_core_volume_backup_policy_assignment" "boot_volume_backup_policy" { - count = var.controller_boot_volume_backup ? 1 : 0 + count = var.controller_boot_volume_backup ? 1 : 0 depends_on = [oci_core_volume_backup_policy.controller_boot_volume_backup_policy] - asset_id = oci_core_instance.controller.boot_volume_id - policy_id = oci_core_volume_backup_policy.controller_boot_volume_backup_policy[0].id + asset_id = oci_core_instance.controller.boot_volume_id + policy_id = oci_core_volume_backup_policy.controller_boot_volume_backup_policy[0].id } resource "oci_resourcemanager_private_endpoint" "rms_private_endpoint" { - count = var.private_deployment ? 1 : 0 + count = var.private_deployment ? 
1 : 0 compartment_id = var.targetCompartment display_name = "rms_private_endpoint" description = "rms_private_endpoint_description" @@ -46,11 +46,11 @@ resource "oci_resourcemanager_private_endpoint" "rms_private_endpoint" { subnet_id = local.subnet_id } -resource "null_resource" "boot_volume_backup_policy" { - depends_on = [oci_core_instance.controller, oci_core_volume_backup_policy.controller_boot_volume_backup_policy, oci_core_volume_backup_policy_assignment.boot_volume_backup_policy] - triggers = { +resource "null_resource" "boot_volume_backup_policy" { + depends_on = [oci_core_instance.controller, oci_core_volume_backup_policy.controller_boot_volume_backup_policy, oci_core_volume_backup_policy_assignment.boot_volume_backup_policy] + triggers = { controller = oci_core_instance.controller.id - } + } } resource "oci_core_instance" "controller" { @@ -61,18 +61,18 @@ resource "oci_core_instance" "controller" { dynamic "shape_config" { for_each = local.is_controller_flex_shape - content { - ocpus = shape_config.value - memory_in_gbs = var.controller_custom_memory ? var.controller_memory : 16 * shape_config.value - } + content { + ocpus = shape_config.value + memory_in_gbs = var.controller_custom_memory ? var.controller_memory : 16 * shape_config.value + } } agent_config { is_management_disabled = true - } - display_name = "${local.cluster_name}-controller" + } + display_name = "${local.cluster_name}-controller" freeform_tags = { - "cluster_name" = local.cluster_name + "cluster_name" = local.cluster_name "parent_cluster" = local.cluster_name } @@ -81,32 +81,33 @@ resource "oci_core_instance" "controller" { user_data = base64encode(data.template_file.controller_config.rendered) } source_details { -// source_id = var.use_standard_image ? data.oci_core_images.linux.images.0.id : local.custom_controller_image_ocid - source_id = local.controller_image + // source_id = var.use_standard_image ? 
data.oci_core_images.linux.images.0.id : local.custom_controller_image_ocid + source_id = local.controller_image boot_volume_size_in_gbs = var.controller_boot_volume_size - source_type = "image" + boot_volume_vpus_per_gb = 30 + source_type = "image" } create_vnic_details { - subnet_id = local.controller_subnet_id + subnet_id = local.controller_subnet_id assign_public_ip = local.controller_bool_ip } -} +} -resource "null_resource" "controller" { - depends_on = [oci_core_instance.controller, oci_core_volume_attachment.controller_volume_attachment ] - triggers = { +resource "null_resource" "controller" { + depends_on = [oci_core_instance.controller, oci_core_volume_attachment.controller_volume_attachment] + triggers = { controller = oci_core_instance.controller.id - } + } provisioner "remote-exec" { inline = [ "#!/bin/bash", - "sudo mkdir -p /opt/oci-hpc", + "sudo mkdir -p /opt/oci-hpc", "sudo chown ${var.controller_username}:${var.controller_username} /opt/oci-hpc/", "mkdir -p /opt/oci-hpc/bin", "mkdir -p /opt/oci-hpc/playbooks" - ] + ] connection { host = local.host type = "ssh" @@ -115,8 +116,8 @@ resource "null_resource" "controller" { } } provisioner "file" { - source = "playbooks" - destination = "/opt/oci-hpc/" + source = "playbooks" + destination = "/opt/oci-hpc/" connection { host = local.host type = "ssh" @@ -157,7 +158,7 @@ resource "null_resource" "controller" { private_key = tls_private_key.ssh.private_key_pem } } - provisioner "file" { + provisioner "file" { source = "logs" destination = "/opt/oci-hpc/" connection { @@ -187,11 +188,11 @@ resource "null_resource" "controller" { private_key = tls_private_key.ssh.private_key_pem } } - provisioner "file" { - content = templatefile("${path.module}/configure.tpl", { + provisioner "file" { + content = templatefile("${path.module}/configure.tpl", { configure = var.configure }) - destination = "/tmp/configure.conf" + destination = "/tmp/configure.conf" connection { host = local.host type = "ssh" @@ -211,7 +212,7 
@@ resource "null_resource" "controller" { } } - provisioner "file" { + provisioner "file" { content = tls_private_key.ssh.public_key_openssh destination = "/home/${var.controller_username}/.ssh/id_rsa.pub" connection { @@ -222,86 +223,90 @@ resource "null_resource" "controller" { } } } -resource "null_resource" "cluster" { - depends_on = [null_resource.controller, null_resource.backup, oci_core_compute_cluster.compute_cluster, oci_core_cluster_network.cluster_network, oci_core_instance.controller, oci_core_volume_attachment.controller_volume_attachment ] - triggers = { +resource "null_resource" "cluster" { + depends_on = [null_resource.controller, null_resource.backup, oci_core_compute_cluster.compute_cluster, oci_core_cluster_network.cluster_network, oci_core_instance.controller, oci_core_volume_attachment.controller_volume_attachment] + triggers = { cluster_instances = join(", ", local.cluster_instances_names) - } + } provisioner "file" { - content = templatefile("${path.module}/inventory.tpl", { - controller_name = oci_core_instance.controller.display_name, - controller_ip = oci_core_instance.controller.private_ip, - backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", - backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", - login_name = var.login_node ? oci_core_instance.login[0].display_name : "", - login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", - compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) - public_subnet = data.oci_core_subnet.public_subnet.cidr_block, - private_subnet = data.oci_core_subnet.private_subnet.cidr_block, - rdma_network = cidrhost(var.rdma_subnet, 0), - rdma_netmask = cidrnetmask(var.rdma_subnet), - zone_name = local.zone_name, - dns_entries = var.dns_entries, - nfs = var.node_count > 0 && var.use_scratch_nfs ? 
local.cluster_instances_names[0] : "", - home_nfs = var.home_nfs, - create_fss = var.create_fss, - home_fss = var.home_fss, - scratch_nfs = var.use_scratch_nfs && var.node_count > 0, - cluster_nfs = var.use_cluster_nfs, - cluster_nfs_path = var.cluster_nfs_path, - scratch_nfs_path = var.scratch_nfs_path, - add_nfs = var.add_nfs, - nfs_target_path = var.nfs_target_path, - nfs_source_IP = local.nfs_source_IP, - nfs_source_path = var.nfs_source_path, - nfs_options = var.nfs_options, - localdisk = var.localdisk, - log_vol = var.log_vol, - redundancy = var.redundancy, - cluster_network = var.cluster_network, - use_compute_agent = var.use_compute_agent, - slurm = var.slurm, - rack_aware = var.rack_aware, - slurm_nfs_path = var.slurm_nfs ? var.nfs_source_path : var.cluster_nfs_path - spack = var.spack, - ldap = var.ldap, - controller_block = var.controller_block, - login_block = var.login_block, - scratch_nfs_type = local.scratch_nfs_type, - controller_mount_ip = local.controller_mount_ip, - login_mount_ip = local.login_mount_ip, - cluster_mount_ip = local.mount_ip, - autoscaling = var.autoscaling, - cluster_name = local.cluster_name, - shape = local.shape, - instance_pool_ocpus = local.instance_pool_ocpus, - queue=var.queue, - monitoring = var.monitoring, - hyperthreading = var.hyperthreading, - controller_username = var.controller_username, - compute_username = var.compute_username, - autoscaling_monitoring = var.autoscaling_monitoring, + content = templatefile("${path.module}/inventory.tpl", { + controller_name = oci_core_instance.controller.display_name, + controller_ip = oci_core_instance.controller.private_ip, + backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", + backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip : "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? oci_core_instance.login[0].private_ip : "", + monitoring_name = var.monitoring_node ? 
oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip : "", + compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([], []) + public_subnet = data.oci_core_subnet.public_subnet.cidr_block, + private_subnet = data.oci_core_subnet.private_subnet.cidr_block, + rdma_network = cidrhost(var.rdma_subnet, 0), + rdma_netmask = cidrnetmask(var.rdma_subnet), + zone_name = local.zone_name, + dns_entries = var.dns_entries, + nfs = var.node_count > 0 && var.use_scratch_nfs ? local.cluster_instances_names[0] : "", + home_nfs = var.home_nfs, + create_fss = var.create_fss, + home_fss = var.home_fss, + scratch_nfs = var.use_scratch_nfs && var.node_count > 0, + cluster_nfs = var.use_cluster_nfs, + cluster_nfs_path = var.cluster_nfs_path, + scratch_nfs_path = var.scratch_nfs_path, + add_nfs = var.add_nfs, + nfs_target_path = var.nfs_target_path, + nfs_source_IP = local.nfs_source_IP, + nfs_source_path = var.nfs_source_path, + nfs_options = var.nfs_options, + localdisk = var.localdisk, + log_vol = var.log_vol, + redundancy = var.redundancy, + cluster_network = var.cluster_network, + use_compute_agent = var.use_compute_agent, + slurm = var.slurm, + rack_aware = var.rack_aware, + slurm_nfs_path = var.slurm_nfs ? 
var.nfs_source_path : var.cluster_nfs_path + spack = var.spack, + ldap = var.ldap, + controller_block = var.controller_block, + login_block = var.login_block, + scratch_nfs_type = local.scratch_nfs_type, + controller_mount_ip = local.controller_mount_ip, + login_mount_ip = local.login_mount_ip, + cluster_mount_ip = local.mount_ip, + autoscaling = var.autoscaling, + cluster_name = local.cluster_name, + shape = local.shape, + instance_pool_ocpus = local.instance_pool_ocpus, + queue = var.queue, + cluster_monitoring = var.cluster_monitoring, + hyperthreading = var.hyperthreading, + controller_username = var.controller_username, + compute_username = var.compute_username, + autoscaling_monitoring = var.autoscaling_monitoring, autoscaling_mysql_service = var.autoscaling_mysql_service, - monitoring_mysql_ip = var.autoscaling_monitoring && var.autoscaling_mysql_service ? oci_mysql_mysql_db_system.monitoring_mysql_db_system[0].ip_address : "localhost", - admin_password = var.admin_password, - admin_username = var.autoscaling_mysql_service ? var.admin_username : "root", - enroot = var.enroot, - pyxis = var.pyxis, - privilege_sudo = var.privilege_sudo, - privilege_group_name = var.privilege_group_name, - latency_check = var.latency_check, - pam = var.pam, - sacct_limits = var.sacct_limits, - inst_prin = var.inst_prin, - region = var.region, - tenancy_ocid = var.tenancy_ocid, - api_fingerprint = var.api_fingerprint, - api_user_ocid = var.api_user_ocid, - healthchecks = var.healthchecks - }) - - destination = "/opt/oci-hpc/playbooks/inventory" + monitoring_mysql_ip = var.autoscaling_monitoring && var.autoscaling_mysql_service ? oci_mysql_mysql_db_system.monitoring_mysql_db_system[0].ip_address : "localhost", + admin_password = var.admin_password, + admin_username = var.autoscaling_mysql_service ? 
var.admin_username : "root", + enroot = var.enroot, + pyxis = var.pyxis, + privilege_sudo = var.privilege_sudo, + privilege_group_name = var.privilege_group_name, + latency_check = var.latency_check, + pam = var.pam, + sacct_limits = var.sacct_limits, + inst_prin = var.inst_prin, + region = var.region, + tenancy_ocid = var.tenancy_ocid, + api_fingerprint = var.api_fingerprint, + api_user_ocid = var.api_user_ocid, + healthchecks = var.healthchecks, + change_hostname = var.change_hostname, + hostname_convention = var.hostname_convention + }) + + destination = "/opt/oci-hpc/playbooks/inventory" connection { host = local.host type = "ssh" @@ -312,7 +317,7 @@ resource "null_resource" "cluster" { provisioner "file" { - content = var.node_count > 0 ? join("\n",local.cluster_instances_ips) : "\n" + content = var.node_count > 0 ? join("\n", local.cluster_instances_ips) : "\n" destination = "/tmp/hosts" connection { host = local.host @@ -323,14 +328,14 @@ resource "null_resource" "cluster" { } provisioner "file" { - content = templatefile(var.inst_prin ? "${path.module}/autoscaling/provider_inst_prin.tpl" : "${path.module}/autoscaling/provider_user.tpl", { - api_user_ocid = var.api_user_ocid, - api_fingerprint = var.api_fingerprint, + content = templatefile(var.inst_prin ? 
"${path.module}/autoscaling/provider_inst_prin.tpl" : "${path.module}/autoscaling/provider_user.tpl", { + api_user_ocid = var.api_user_ocid, + api_fingerprint = var.api_fingerprint, private_key_path = "/opt/oci-hpc/autoscaling/credentials/key.pem", - tenancy_ocid = var.tenancy_ocid - }) + tenancy_ocid = var.tenancy_ocid + }) - destination = "/opt/oci-hpc/autoscaling/tf_init/provider.tf" + destination = "/opt/oci-hpc/autoscaling/tf_init/provider.tf" connection { host = local.host type = "ssh" @@ -340,28 +345,31 @@ resource "null_resource" "cluster" { } provisioner "file" { - content = templatefile("${path.module}/queues.conf", { - cluster_network = var.cluster_network, - use_compute_agent = var.use_compute_agent, - compute_cluster = var.compute_cluster, - marketplace_listing = var.marketplace_listing, - image = local.image_ocid, - use_marketplace_image = var.use_marketplace_image, - boot_volume_size = var.boot_volume_size, - shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, - region = var.region, - ad = var.use_multiple_ads? join(" ", [var.ad, var.secondary_ad, var.third_ad]) : var.ad, - private_subnet = data.oci_core_subnet.private_subnet.cidr_block, - private_subnet_id = local.subnet_id, - targetCompartment = var.targetCompartment, - instance_pool_ocpus = local.instance_pool_ocpus, - instance_pool_memory = var.instance_pool_memory, + content = templatefile("${path.module}/queues.conf", { + cluster_network = var.cluster_network, + use_compute_agent = var.use_compute_agent, + compute_cluster = var.compute_cluster, + marketplace_listing = var.marketplace_listing, + image = local.image_ocid, + use_marketplace_image = var.use_marketplace_image, + boot_volume_size = var.boot_volume_size, + shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, + region = var.region, + ad = var.use_multiple_ads ? 
join(" ", [var.ad, var.secondary_ad, var.third_ad]) : var.ad, + private_subnet = data.oci_core_subnet.private_subnet.cidr_block, + private_subnet_id = local.subnet_id, + targetCompartment = var.targetCompartment, + instance_pool_ocpus = local.instance_pool_ocpus, + instance_pool_memory = var.instance_pool_memory, instance_pool_custom_memory = var.instance_pool_custom_memory, - queue=var.queue, - hyperthreading = var.hyperthreading - }) + queue = var.queue, + hyperthreading = var.hyperthreading, + cluster_name = local.cluster_name, + change_hostname = var.change_hostname, + hostname_convention = var.hostname_convention + }) - destination = "/opt/oci-hpc/conf/queues.conf" + destination = "/opt/oci-hpc/conf/queues.conf" connection { host = local.host type = "ssh" @@ -369,88 +377,90 @@ resource "null_resource" "cluster" { private_key = tls_private_key.ssh.private_key_pem } } - + provisioner "file" { - content = templatefile("${path.module}/conf/variables.tpl", { - controller_name = oci_core_instance.controller.display_name, - controller_ip = oci_core_instance.controller.private_ip, - backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", - backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", - login_name = var.login_node ? oci_core_instance.login[0].display_name : "", - login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", - compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) - public_subnet = data.oci_core_subnet.public_subnet.cidr_block, - public_subnet_id = local.controller_subnet_id, - private_subnet = data.oci_core_subnet.private_subnet.cidr_block, - private_subnet_id = local.subnet_id, - rdma_subnet = var.rdma_subnet, - nfs = var.node_count > 0 ? 
local.cluster_instances_names[0] : "", - scratch_nfs = var.use_scratch_nfs && var.node_count > 0, - scratch_nfs_path = var.scratch_nfs_path, - use_scratch_nfs = var.use_scratch_nfs, - slurm = var.slurm, - rack_aware = var.rack_aware, - slurm_nfs_path = var.add_nfs ? var.nfs_source_path : var.cluster_nfs_path - spack = var.spack, - ldap = var.ldap, - controller_block = var.controller_block, - login_block = var.login_block, - scratch_nfs_type = local.scratch_nfs_type, - controller_mount_ip = local.controller_mount_ip, - login_mount_ip = local.login_mount_ip, - cluster_mount_ip = local.mount_ip, - scratch_nfs_type_cluster = var.scratch_nfs_type_cluster, - scratch_nfs_type_pool = var.scratch_nfs_type_pool, + content = templatefile("${path.module}/conf/variables.tpl", { + controller_name = oci_core_instance.controller.display_name, + controller_ip = oci_core_instance.controller.private_ip, + backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", + backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip : "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? oci_core_instance.login[0].private_ip : "", + monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip : "", + compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([], []) + public_subnet = data.oci_core_subnet.public_subnet.cidr_block, + public_subnet_id = local.controller_subnet_id, + private_subnet = data.oci_core_subnet.private_subnet.cidr_block, + private_subnet_id = local.subnet_id, + rdma_subnet = var.rdma_subnet, + nfs = var.node_count > 0 ? 
local.cluster_instances_names[0] : "", + scratch_nfs = var.use_scratch_nfs && var.node_count > 0, + scratch_nfs_path = var.scratch_nfs_path, + use_scratch_nfs = var.use_scratch_nfs, + slurm = var.slurm, + rack_aware = var.rack_aware, + slurm_nfs_path = var.add_nfs ? var.nfs_source_path : var.cluster_nfs_path + spack = var.spack, + ldap = var.ldap, + controller_block = var.controller_block, + login_block = var.login_block, + scratch_nfs_type = local.scratch_nfs_type, + controller_mount_ip = local.controller_mount_ip, + login_mount_ip = local.login_mount_ip, + cluster_mount_ip = local.mount_ip, + scratch_nfs_type_cluster = var.scratch_nfs_type_cluster, + scratch_nfs_type_pool = var.scratch_nfs_type_pool, controller_block_volume_performance = var.controller_block_volume_performance, - region = var.region, - tenancy_ocid = var.tenancy_ocid, - vcn_subnet = var.vcn_subnet, - vcn_id = local.vcn_id, - zone_name = local.zone_name, - dns_entries = var.dns_entries, - cluster_block_volume_size = var.cluster_block_volume_size, - cluster_block_volume_performance = var.cluster_block_volume_performance, - ssh_cidr = var.ssh_cidr, - use_cluster_nfs = var.use_cluster_nfs, - cluster_nfs_path = var.cluster_nfs_path, - home_nfs = var.home_nfs, - create_fss = var.create_fss, - home_fss = var.home_fss, - add_nfs = var.add_nfs, - nfs_target_path = var.nfs_target_path, - nfs_source_IP = local.nfs_source_IP, - nfs_source_path = var.nfs_source_path, - nfs_options = var.nfs_options, - localdisk = var.localdisk, - log_vol = var.log_vol, - redundancy = var.redundancy, - monitoring = var.monitoring, - hyperthreading = var.hyperthreading, - unsupported = var.unsupported, - autoscaling_monitoring = var.autoscaling_monitoring, - enroot = var.enroot, - pyxis = var.pyxis, - privilege_sudo = var.privilege_sudo, - privilege_group_name = var.privilege_group_name, - latency_check = var.latency_check, - private_deployment = var.private_deployment, - use_multiple_ads = var.use_multiple_ads, - 
controller_username = var.controller_username, - compute_username = var.compute_username, - pam = var.pam, - sacct_limits = var.sacct_limits, - use_compute_agent = var.use_compute_agent, - BIOS = var.BIOS, - IOMMU = var.IOMMU, - SMT = var.SMT, - virt_instr = var.virt_instr, - access_ctrl = var.access_ctrl, - numa_nodes_per_socket = var.numa_nodes_per_socket, - percentage_of_cores_enabled = var.percentage_of_cores_enabled, - healthchecks = var.healthchecks - }) - - destination = "/opt/oci-hpc/conf/variables.tf" + region = var.region, + tenancy_ocid = var.tenancy_ocid, + vcn_subnet = var.vcn_subnet, + vcn_id = local.vcn_id, + zone_name = local.zone_name, + dns_entries = var.dns_entries, + cluster_block_volume_size = var.cluster_block_volume_size, + cluster_block_volume_performance = var.cluster_block_volume_performance, + ssh_cidr = var.ssh_cidr, + use_cluster_nfs = var.use_cluster_nfs, + cluster_nfs_path = var.cluster_nfs_path, + home_nfs = var.home_nfs, + create_fss = var.create_fss, + home_fss = var.home_fss, + add_nfs = var.add_nfs, + nfs_target_path = var.nfs_target_path, + nfs_source_IP = local.nfs_source_IP, + nfs_source_path = var.nfs_source_path, + nfs_options = var.nfs_options, + localdisk = var.localdisk, + log_vol = var.log_vol, + redundancy = var.redundancy, + cluster_monitoring = var.cluster_monitoring, + hyperthreading = var.hyperthreading, + unsupported = var.unsupported, + autoscaling_monitoring = var.autoscaling_monitoring, + enroot = var.enroot, + pyxis = var.pyxis, + privilege_sudo = var.privilege_sudo, + privilege_group_name = var.privilege_group_name, + latency_check = var.latency_check, + private_deployment = var.private_deployment, + use_multiple_ads = var.use_multiple_ads, + controller_username = var.controller_username, + compute_username = var.compute_username, + pam = var.pam, + sacct_limits = var.sacct_limits, + use_compute_agent = var.use_compute_agent, + BIOS = var.BIOS, + IOMMU = var.IOMMU, + SMT = var.SMT, + virt_instr = 
var.virt_instr, + access_ctrl = var.access_ctrl, + numa_nodes_per_socket = var.numa_nodes_per_socket, + percentage_of_cores_enabled = var.percentage_of_cores_enabled, + healthchecks = var.healthchecks + }) + + destination = "/opt/oci-hpc/conf/variables.tf" connection { host = local.host type = "ssh" @@ -459,18 +469,18 @@ resource "null_resource" "cluster" { } } -provisioner "file" { - content = templatefile("${path.module}/initial_mon.tpl", { - cluster_ocid=local.cluster_ocid, - shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, - queue=var.queue, + provisioner "file" { + content = templatefile("${path.module}/initial_mon.tpl", { + cluster_ocid = local.cluster_ocid, + shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, + queue = var.queue, cluster_network = var.cluster_network, - ocids = join(",", local.cluster_instances_ids), - hostnames = join(",", local.cluster_instances_names), - ips = join(",", local.cluster_instances_ips) - }) + ocids = join(",", local.cluster_instances_ids), + hostnames = join(",", local.cluster_instances_names), + ips = join(",", local.cluster_instances_ips) + }) - destination = "/tmp/initial.mon" + destination = "/tmp/initial.mon" connection { host = local.host type = "ssh" @@ -480,7 +490,7 @@ provisioner "file" { } provisioner "file" { content = base64decode(var.api_user_key) - destination = "/opt/oci-hpc/autoscaling/credentials/key.pem" + destination = "/opt/oci-hpc/autoscaling/credentials/key.pem" connection { host = local.host type = "ssh" @@ -500,10 +510,10 @@ provisioner "file" { "chmod 755 /opt/oci-hpc/samples/*.sh", "chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem", "echo ${var.configure} > /tmp/configure.conf", - "timeout 2h /opt/oci-hpc/bin/configure.sh | tee /opt/oci-hpc/logs/initial_configure.log", + "timeout 2h /opt/oci-hpc/bin/configure.sh 2>&1 | tee /opt/oci-hpc/logs/initial_configure.log", "exit_code=$${PIPESTATUS[0]}", 
"/opt/oci-hpc/bin/initial_monitoring.sh", - "exit $exit_code" ] + "exit $exit_code"] connection { host = local.host type = "ssh" @@ -514,14 +524,14 @@ provisioner "file" { } data "oci_objectstorage_namespace" "compartment_namespace" { - compartment_id = var.targetCompartment + compartment_id = var.targetCompartment } locals { current_timestamp = timestamp() current_timestamp_formatted = formatdate("YYYYMMDDhhmmss", local.current_timestamp) - rdma_nic_metric_bucket_name = format("%s_%s","RDMA_NIC_metrics",local.current_timestamp_formatted) - par_path = ".." + rdma_nic_metric_bucket_name = format("%s_%s", "RDMA_NIC_metrics", local.current_timestamp_formatted) + par_path = ".." } /* saving the PAR into file: ../PAR_file_for_metrics. @@ -530,7 +540,7 @@ this PAR is used by the scripts to upload NIC metrics to object storage (i.e. sc resource "oci_objectstorage_bucket" "RDMA_NIC_metrics_bucket" { - count = (var.controller_object_storage_par) ? 1 : 0 + count = (var.controller_object_storage_par) ? 1 : 0 compartment_id = var.targetCompartment name = local.rdma_nic_metric_bucket_name namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace @@ -538,10 +548,10 @@ resource "oci_objectstorage_bucket" "RDMA_NIC_metrics_bucket" { } resource "oci_objectstorage_preauthrequest" "RDMA_NIC_metrics_par" { - count = (var.controller_object_storage_par) ? 1 : 0 - depends_on = [oci_objectstorage_bucket.RDMA_NIC_metrics_bucket] - access_type = "AnyObjectWrite" - bucket = local.rdma_nic_metric_bucket_name + count = (var.controller_object_storage_par) ? 
1 : 0 + depends_on = [oci_objectstorage_bucket.RDMA_NIC_metrics_bucket] + access_type = "AnyObjectWrite" + bucket = local.rdma_nic_metric_bucket_name name = format("%s-%s", "RDMA_NIC_metrics_bucket", var.tenancy_ocid) namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace time_expires = "2030-08-01T00:00:00+00:00" @@ -549,21 +559,21 @@ resource "oci_objectstorage_preauthrequest" "RDMA_NIC_metrics_par" { output "RDMA_NIC_metrics_url" { - depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par] - value = (var.controller_object_storage_par) ? "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" : "" + depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par] + value = (var.controller_object_storage_par) ? "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" : "" } resource "local_file" "PAR" { - count = (var.controller_object_storage_par) ? 1 : 0 - depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par] - content = "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" - filename = "${local.par_path}/PAR_file_for_metrics" - } + count = (var.controller_object_storage_par) ? 1 : 0 + depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par] + content = "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" + filename = "${local.par_path}/PAR_file_for_metrics" +} resource "oci_dns_rrset" "rrset-controller" { - count = var.dns_entries ? 1 : 0 + count = var.dns_entries ? 
1 : 0 zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id domain = "${oci_core_instance.controller.display_name}.${local.zone_name}" rtype = "A" @@ -573,7 +583,7 @@ resource "oci_dns_rrset" "rrset-controller" { rdata = oci_core_instance.controller.private_ip ttl = 3600 } - scope = "PRIVATE" + scope = "PRIVATE" view_id = data.oci_dns_views.dns_views.views[0].id } diff --git a/data.tf b/data.tf index e5dd4277..a36c5fc6 100755 --- a/data.tf +++ b/data.tf @@ -14,19 +14,19 @@ data "oci_core_services" "services" { } } data "oci_core_cluster_network_instances" "cluster_network_instances" { - count = (! var.compute_cluster) && var.cluster_network && var.node_count > 0 ? 1 : 0 + count = (!var.compute_cluster) && var.cluster_network && var.node_count > 0 ? 1 : 0 cluster_network_id = oci_core_cluster_network.cluster_network[0].id compartment_id = var.targetCompartment } data "oci_core_instance_pool_instances" "instance_pool_instances" { - count = ( ! var.cluster_network ) && ( var.node_count > 0 ) ? 1 : 0 + count = (!var.cluster_network) && (var.node_count > 0) ? 1 : 0 instance_pool_id = oci_core_instance_pool.instance_pool[0].id - compartment_id = var.targetCompartment + compartment_id = var.targetCompartment } data "oci_core_instance" "cluster_network_instances" { - count = (! var.compute_cluster) && var.cluster_network && var.node_count > 0 ? var.node_count : 0 + count = (!var.compute_cluster) && var.cluster_network && var.node_count > 0 ? 
var.node_count : 0 instance_id = data.oci_core_cluster_network_instances.cluster_network_instances[0].instances[count.index]["id"] } @@ -35,60 +35,56 @@ data "oci_core_instance" "instance_pool_instances" { instance_id = data.oci_core_instance_pool_instances.instance_pool_instances[0].instances[count.index]["id"] } -data "oci_core_vcn" "vcn" { +data "oci_core_vcn" "vcn" { vcn_id = local.vcn_id -} -data "oci_core_subnet" "private_subnet" { - subnet_id = local.subnet_id +} +data "oci_core_subnet" "private_subnet" { + subnet_id = local.subnet_id } -data "oci_core_subnet" "public_subnet" { +data "oci_core_subnet" "public_subnet" { subnet_id = local.controller_subnet_id -} - -data "oci_core_images" "linux" { - compartment_id = var.targetCompartment - operating_system = "Oracle Linux" - operating_system_version = "7.9" - filter { - name = "display_name" - values = ["^([a-zA-z]+)-([a-zA-z]+)-([\\.0-9]+)-([\\.0-9-]+)$"] - regex = true - } } data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reachable_ip" { - #Required - count = var.private_deployment ? 1 : 0 - private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id - private_ip = tostring(oci_core_instance.controller.private_ip) + #Required + count = var.private_deployment ? 1 : 0 + private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id + private_ip = tostring(oci_core_instance.controller.private_ip) } data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reachable_ip_backup" { - #Required - count = (var.private_deployment && var.slurm_ha) ? 1 : 0 - private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id - private_ip = tostring(oci_core_instance.backup[0].private_ip) + #Required + count = (var.private_deployment && var.slurm_ha) ? 
1 : 0 + private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id + private_ip = tostring(oci_core_instance.backup[0].private_ip) } data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reachable_ip_login" { - #Required - count = (var.private_deployment && var.login_node) ? 1 : 0 - private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id - private_ip = tostring(oci_core_instance.login[0].private_ip) + #Required + count = (var.private_deployment && var.login_node) ? 1 : 0 + private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id + private_ip = tostring(oci_core_instance.login[0].private_ip) +} + +data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reachable_ip_monitoring" { + #Required + count = (var.private_deployment && var.monitoring_node) ? 1 : 0 + private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id + private_ip = tostring(oci_core_instance.monitoring[0].private_ip) } data "oci_dns_views" "dns_views" { - depends_on = [local.controller_subnet, oci_core_vcn.vcn] + depends_on = [local.controller_subnet, oci_core_vcn.vcn] compartment_id = var.targetCompartment - scope = "PRIVATE" - display_name = data.oci_core_vcn.vcn.display_name + scope = "PRIVATE" + display_name = data.oci_core_vcn.vcn.display_name } data "oci_dns_zones" "dns_zones" { - depends_on = [local.controller_subnet, oci_core_vcn.vcn, oci_dns_zone.dns_zone ] - compartment_id = var.targetCompartment - name = local.zone_name - zone_type = "PRIMARY" - scope = "PRIVATE" + depends_on = [local.controller_subnet, oci_core_vcn.vcn, oci_dns_zone.dns_zone] + compartment_id = var.targetCompartment + name = local.zone_name + zone_type = "PRIMARY" + scope = "PRIVATE" } \ No newline at end of file diff --git a/fss.tf b/fss.tf index 1e18216f..88dc9a55 100644 --- a/fss.tf +++ b/fss.tf @@ -1,47 +1,47 @@ resource "oci_file_storage_file_system" "FSS" { - count = 
var.create_fss ? 1 : 0 - availability_domain = var.fss_ad - compartment_id = var.fss_compartment - display_name = "${local.cluster_name}-fss" - } + count = var.create_fss ? 1 : 0 + availability_domain = var.fss_ad + compartment_id = var.fss_compartment + display_name = "${local.cluster_name}-fss" +} resource "oci_file_storage_file_system" "FSS_home" { - count = var.create_fss && var.home_fss ? 1 : 0 - availability_domain = var.fss_ad - compartment_id = var.fss_compartment - display_name = "${local.cluster_name}-fss-home" - } + count = var.create_fss && var.home_fss ? 1 : 0 + availability_domain = var.fss_ad + compartment_id = var.fss_compartment + display_name = "${local.cluster_name}-fss-home" +} resource "oci_file_storage_mount_target" "FSSMountTarget" { - count = var.create_fss ? 1 : 0 - availability_domain = var.fss_ad + count = var.create_fss ? var.mount_target_count : 0 + availability_domain = var.fss_ad compartment_id = var.fss_compartment subnet_id = local.subnet_id - display_name = "${local.cluster_name}-mt" - hostname_label = "fileserver" + display_name = "${local.cluster_name}-mt-${count.index}" + hostname_label = "fileserver${count.index}" } resource "oci_file_storage_export" "FSSExport" { - count = var.create_fss ? 1 : 0 - export_set_id = oci_file_storage_mount_target.FSSMountTarget.0.export_set_id - file_system_id = oci_file_storage_file_system.FSS.0.id - path = var.nfs_source_path + count = var.create_fss ? var.mount_target_count : 0 + export_set_id = oci_file_storage_mount_target.FSSMountTarget[count.index].export_set_id + file_system_id = oci_file_storage_file_system.FSS[0].id + path = var.nfs_source_path export_options { - source = data.oci_core_vcn.vcn.cidr_block - access = "READ_WRITE" + source = data.oci_core_vcn.vcn.cidr_block + access = "READ_WRITE" identity_squash = "NONE" } } resource "oci_file_storage_export" "FSSExport_home" { - count = var.create_fss && var.home_fss ? 
1 : 0 - export_set_id = oci_file_storage_mount_target.FSSMountTarget.0.export_set_id - file_system_id = oci_file_storage_file_system.FSS_home.0.id + count = var.create_fss && var.home_fss ? var.mount_target_count : 0 + export_set_id = oci_file_storage_mount_target.FSSMountTarget[count.index].export_set_id + file_system_id = oci_file_storage_file_system.FSS_home[0].id path = "/home" export_options { - source = data.oci_core_vcn.vcn.cidr_block - access = "READ_WRITE" + source = data.oci_core_vcn.vcn.cidr_block + access = "READ_WRITE" identity_squash = "NONE" } } diff --git a/instance-pool-configuration.tf b/instance-pool-configuration.tf index 1fffd744..0beb34e0 100755 --- a/instance-pool-configuration.tf +++ b/instance-pool-configuration.tf @@ -1,5 +1,5 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" { - count = ( ! var.cluster_network ) && ( var.node_count > 0 ) ? 1 : 0 + count = (!var.cluster_network) && (var.node_count > 0) ? 1 : 0 depends_on = [oci_core_app_catalog_subscription.mp_image_subscription] compartment_id = var.targetCompartment display_name = local.cluster_name @@ -13,11 +13,11 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" { } display_name = local.cluster_name freeform_tags = { - "cluster_name" = local.cluster_name + "cluster_name" = local.cluster_name "parent_cluster" = local.cluster_name } metadata = { -# TODO: add user key to the authorized_keys + # TODO: add user key to the authorized_keys ssh_authorized_keys = "${var.ssh_key}\n${tls_private_key.ssh.public_key_openssh}" user_data = base64encode(data.template_file.config.rendered) } @@ -30,12 +30,12 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" { plugins_config { desired_state = "DISABLED" name = "OS Management Service Agent" - } - dynamic plugins_config { + } + dynamic "plugins_config" { for_each = length(regexall(".*GPU.*", var.instance_pool_shape)) > 0 ? 
["ENABLED"] : ["DISABLED"] content { - name = "Compute RDMA GPU Monitoring" - desired_state = plugins_config.value + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value } } } @@ -43,27 +43,28 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" { dynamic "shape_config" { for_each = local.is_instance_pool_flex_shape - content { - ocpus = shape_config.value - memory_in_gbs = var.instance_pool_custom_memory ? var.instance_pool_memory : 16 * shape_config.value - } + content { + ocpus = shape_config.value + memory_in_gbs = var.instance_pool_custom_memory ? var.instance_pool_memory : 16 * shape_config.value + } } - + dynamic "platform_config" { for_each = var.BIOS ? range(1) : [] content { - type = local.platform_type - are_virtual_instructions_enabled = var.virt_instr - is_access_control_service_enabled = var.access_ctrl + type = local.platform_type + are_virtual_instructions_enabled = var.virt_instr + is_access_control_service_enabled = var.access_ctrl is_input_output_memory_management_unit_enabled = var.IOMMU - is_symmetric_multi_threading_enabled = var.SMT - numa_nodes_per_socket = var.numa_nodes_per_socket == "Default" ? (local.platform_type == "GENERIC_BM" ? "NPS1": "NPS4" ): var.numa_nodes_per_socket - percentage_of_cores_enabled = var.percentage_of_cores_enabled == "Default" ? 100 : tonumber(var.percentage_of_cores_enabled) + is_symmetric_multi_threading_enabled = var.SMT + numa_nodes_per_socket = var.numa_nodes_per_socket == "Default" ? (local.platform_type == "GENERIC_BM" ? "NPS1" : "NPS4") : var.numa_nodes_per_socket + percentage_of_cores_enabled = var.percentage_of_cores_enabled == "Default" ? 
100 : tonumber(var.percentage_of_cores_enabled) } } source_details { source_type = "image" boot_volume_size_in_gbs = var.boot_volume_size + boot_volume_vpus_per_gb = 30 image_id = local.instance_pool_image } } diff --git a/instance-pool.tf b/instance-pool.tf index 40a9a46e..3cbde0ac 100755 --- a/instance-pool.tf +++ b/instance-pool.tf @@ -1,33 +1,33 @@ -resource "oci_core_volume" "nfs-instance-pool-volume" { - count = var.scratch_nfs_type_pool == "block" && var.node_count > 0 ? 1 : 0 +resource "oci_core_volume" "nfs-instance-pool-volume" { + count = var.scratch_nfs_type_pool == "block" && var.node_count > 0 ? 1 : 0 availability_domain = var.ad - compartment_id = var.targetCompartment - display_name = "${local.cluster_name}-nfs-volume" - + compartment_id = var.targetCompartment + display_name = "${local.cluster_name}-nfs-volume" + size_in_gbs = var.cluster_block_volume_size vpus_per_gb = split(".", var.cluster_block_volume_performance)[0] -} +} -resource "oci_core_volume_attachment" "instance_pool_volume_attachment" { - count = var.scratch_nfs_type_pool == "block" && var.node_count > 0 ? 1 : 0 +resource "oci_core_volume_attachment" "instance_pool_volume_attachment" { + count = var.scratch_nfs_type_pool == "block" && var.node_count > 0 ? 1 : 0 attachment_type = "iscsi" volume_id = oci_core_volume.nfs-instance-pool-volume[0].id instance_id = local.cluster_instances_ids[0] display_name = "${local.cluster_name}-instance-pool-volume-attachment" device = "/dev/oracleoci/oraclevdb" -} +} resource "oci_core_instance_pool" "instance_pool" { - count = ( ! var.cluster_network ) && ( var.node_count > 0 ) ? 1 : 0 - depends_on = [oci_core_app_catalog_subscription.mp_image_subscription, oci_core_subnet.private-subnet, oci_core_subnet.public-subnet] - compartment_id = var.targetCompartment + count = (!var.cluster_network) && (var.node_count > 0) ? 
1 : 0 + depends_on = [oci_core_app_catalog_subscription.mp_image_subscription, oci_core_subnet.private-subnet, oci_core_subnet.public-subnet] + compartment_id = var.targetCompartment instance_configuration_id = oci_core_instance_configuration.instance_pool_configuration[0].id size = var.node_count display_name = local.cluster_name freeform_tags = { - "cluster_name" = local.cluster_name - "parent_cluster" = local.cluster_name + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name } placement_configurations { availability_domain = var.ad diff --git a/inventory.tpl b/inventory.tpl index c0824ecd..681cf159 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -4,6 +4,8 @@ ${controller_name} ansible_host=${controller_ip} ansible_user=${controller_usern %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${compute_username} role=controller%{ endif } [login] %{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif } +[monitoring] +%{ if monitoring_name != "" }${monitoring_name} ansible_host=${monitoring_ip} ansible_user=${compute_username} role=monitoring%{ endif } [compute_to_add] [compute_configured] %{ for host, ip in compute ~} @@ -53,7 +55,7 @@ redundancy=${redundancy} log_vol=${log_vol} instance_pool_ocpus=${instance_pool_ocpus} queue=${queue} -monitoring=${monitoring} +cluster_monitoring=${cluster_monitoring} hyperthreading=${hyperthreading} ldap=${ldap} autoscaling_monitoring=${autoscaling_monitoring} @@ -61,7 +63,7 @@ autoscaling_mysql_service=${autoscaling_mysql_service} monitoring_mysql_ip=${monitoring_mysql_ip} admin_password = ${admin_password} admin_username = ${admin_username} -instance_type=permanent +instance_type=${cluster_name} enroot=${enroot} pyxis=${pyxis} pam=${pam} @@ -79,4 +81,6 @@ sacct_limits=${sacct_limits} use_compute_agent=${use_compute_agent} zone_name=${zone_name} dns_entries=${dns_entries} -healthchecks=${healthchecks} \ No newline at 
end of file +healthchecks=${healthchecks} +change_hostname=${change_hostname} +hostname_convention=${hostname_convention} \ No newline at end of file diff --git a/locals.tf b/locals.tf index f87a3b68..cbcd8d60 100755 --- a/locals.tf +++ b/locals.tf @@ -1,74 +1,84 @@ -locals { -// display names of instances - cluster_instances_ids = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.id : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id +locals { + // display names of instances + cluster_instances_ids = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.id : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id cluster_instances_names = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.display_name : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name - image_ocid = var.unsupported ? var.image_ocid : var.image + image_ocid = var.unsupported ? var.image_ocid : var.image custom_controller_image_ocid = var.unsupported_controller ? var.unsupported_controller_image : var.custom_controller_image - custom_login_image_ocid = var.unsupported_login ? var.unsupported_login_image : var.custom_login_image - - shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape - instance_pool_ocpus = ( local.shape == "VM.DenseIO.E4.Flex" || local.shape == "VM.DenseIO.E5.Flex" ) ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus - controller_ocpus = ( var.controller_shape == "VM.DenseIO.E4.Flex" || var.controller_shape == "VM.DenseIO.E5.Flex" ) ? var.controller_ocpus_denseIO_flex : var.controller_ocpus - login_ocpus = ( var.login_shape == "VM.DenseIO.E4.Flex" || var.login_shape == "VM.DenseIO.E5.Flex" ) ? 
var.login_ocpus_denseIO_flex : var.login_ocpus -// ips of the instances - cluster_instances_ips = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.private_ip : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip - first_vcn_ip = cidrhost(data.oci_core_subnet.private_subnet.cidr_block,0) - cluster_instances_ips_index = [for ip in local.cluster_instances_ips : tostring((tonumber(split(".",ip)[3])-tonumber(split(".",local.first_vcn_ip)[3]))+256*(tonumber(split(".",ip)[2])-tonumber(split(".",local.first_vcn_ip)[2]))+1)] - -// vcn id derived either from created vcn or existing if specified + custom_login_image_ocid = var.unsupported_login ? var.unsupported_login_image : var.custom_login_image + custom_monitoring_image_ocid = var.unsupported_monitoring ? var.unsupported_monitoring_image : var.custom_monitoring_image + + + shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape + instance_pool_ocpus = (local.shape == "VM.DenseIO.E4.Flex" || local.shape == "VM.DenseIO.E5.Flex") ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus + controller_ocpus = (var.controller_shape == "VM.DenseIO.E4.Flex" || var.controller_shape == "VM.DenseIO.E5.Flex") ? var.controller_ocpus_denseIO_flex : var.controller_ocpus + login_ocpus = (var.login_shape == "VM.DenseIO.E4.Flex" || var.login_shape == "VM.DenseIO.E5.Flex") ? var.login_ocpus_denseIO_flex : var.login_ocpus + monitoring_ocpus = (var.monitoring_shape == "VM.DenseIO.E4.Flex" || var.monitoring_shape == "VM.DenseIO.E5.Flex") ? var.monitoring_ocpus_denseIO_flex : var.monitoring_ocpus + // ips of the instances + cluster_instances_ips = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.private_ip : var.cluster_network ? 
data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip + first_vcn_ip = cidrhost(data.oci_core_subnet.private_subnet.cidr_block, 0) + cluster_instances_ips_index = [for ip in local.cluster_instances_ips : tostring((tonumber(split(".", ip)[3]) - tonumber(split(".", local.first_vcn_ip)[3])) + 256 * (tonumber(split(".", ip)[2]) - tonumber(split(".", local.first_vcn_ip)[2])) + 1)] + + // vcn id derived either from created vcn or existing if specified vcn_id = var.use_existing_vcn ? var.vcn_id : element(concat(oci_core_vcn.vcn.*.id, [""]), 0) -// subnet id derived either from created subnet or existing if specified -// subnet_id = var.use_existing_vcn ? var.private_subnet_id : element(concat(oci_core_subnet.private-subnet.*.id, [""]), 0) + // subnet id derived either from created subnet or existing if specified + // subnet_id = var.use_existing_vcn ? var.private_subnet_id : element(concat(oci_core_subnet.private-subnet.*.id, [""]), 0) subnet_id = var.private_deployment ? var.use_existing_vcn ? var.private_subnet_id : element(concat(oci_core_subnet.private-subnet.*.id, [""]), 1) : var.use_existing_vcn ? var.private_subnet_id : element(concat(oci_core_subnet.private-subnet.*.id, [""]), 0) - nfs_source_IP = var.create_fss ? element(concat(oci_file_storage_mount_target.FSSMountTarget.*.ip_address, [""]), 0) : var.nfs_source_IP -// subnet id derived either from created subnet or existing if specified -// controller_subnet_id = var.use_existing_vcn ? var.public_subnet_id : element(concat(oci_core_subnet.public-subnet.*.id, [""]), 0) + nfs_source_IP = var.create_fss ? oci_dns_rrset.fss-dns-round-robin[0].domain : var.nfs_source_IP + nfs_list_of_mount_target_IPs = var.create_fss ? "[\"${join("\",\"",oci_file_storage_mount_target.FSSMountTarget.*.ip_address)}\"]" : var.nfs_source_IP + + // subnet id derived either from created subnet or existing if specified + // controller_subnet_id = var.use_existing_vcn ? 
var.public_subnet_id : element(concat(oci_core_subnet.public-subnet.*.id, [""]), 0) controller_subnet_id = var.private_deployment ? var.use_existing_vcn ? var.public_subnet_id : element(concat(oci_core_subnet.private-subnet.*.id, [""]), 0) : var.use_existing_vcn ? var.public_subnet_id : element(concat(oci_core_subnet.public-subnet.*.id, [""]), 0) - + cluster_name = var.use_custom_name ? var.cluster_name : random_pet.name.id controller_image = var.use_marketplace_image_controller ? oci_core_app_catalog_subscription.controller_mp_image_subscription[0].listing_resource_id : local.custom_controller_image_ocid - login_image = var.login_node && var.use_marketplace_image_login ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid + login_image = var.login_node && var.use_marketplace_image_login ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid + + monitoring_image = var.monitoring_node && var.use_marketplace_image_monitoring ? oci_core_app_catalog_subscription.monitoring_mp_image_subscription[0].listing_resource_id : local.custom_monitoring_image_ocid cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid - instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid + instance_pool_image = !var.cluster_network && var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid + + // image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? 
var.image : data.oci_core_images.linux.images.0.id -// image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? var.image : data.oci_core_images.linux.images.0.id + is_controller_flex_shape = length(regexall(".*VM.*.*Flex$", var.controller_shape)) > 0 ? [local.controller_ocpus] : [] + is_login_flex_shape = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? [local.login_ocpus] : [] + is_monitoring_flex_shape = length(regexall(".*VM.*.*Flex$", var.monitoring_shape)) > 0 ? [local.monitoring_ocpus] : [] - is_controller_flex_shape = length(regexall(".*VM.*.*Flex$", var.controller_shape)) > 0 ? [local.controller_ocpus]:[] - is_login_flex_shape = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? [local.login_ocpus]:[] + is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus] : [] - is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[] - controller_mount_ip = var.controller_block ? element(concat(oci_core_volume_attachment.controller_volume_attachment.*.ipv4, [""]), 0) : "none" - login_mount_ip = var.login_block ? element(concat(oci_core_volume_attachment.login_volume_attachment.*.ipv4, [""]), 0) : "none" + login_mount_ip = var.login_block ? element(concat(oci_core_volume_attachment.login_volume_attachment.*.ipv4, [""]), 0) : "none" - scratch_nfs_type = var.cluster_network ? var.scratch_nfs_type_cluster : var.scratch_nfs_type_pool + scratch_nfs_type = var.cluster_network ? var.scratch_nfs_type_cluster : var.scratch_nfs_type_pool iscsi_ip = var.cluster_network ? element(concat(oci_core_volume_attachment.cluster_network_volume_attachment.*.ipv4, [""]), 0) : element(concat(oci_core_volume_attachment.instance_pool_volume_attachment.*.ipv4, [""]), 0) - mount_ip = local.scratch_nfs_type == "block" ? 
local.iscsi_ip : "none" + mount_ip = local.scratch_nfs_type == "block" ? local.iscsi_ip : "none" -// Cluster OCID + // Cluster OCID - cluster_ocid = var.node_count > 0 ? var.compute_cluster ? oci_core_compute_cluster.compute_cluster[0].id : var.cluster_network ? oci_core_cluster_network.cluster_network[0].id : oci_core_instance_pool.instance_pool[0].id : "" - host = var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip[0].ip_address : oci_core_instance.controller.public_ip - controller_bool_ip = var.private_deployment ? false : true - login_bool_ip = var.private_deployment ? false : true - controller_subnet = var.private_deployment ? oci_core_subnet.private-subnet : oci_core_subnet.public-subnet + cluster_ocid = var.node_count > 0 ? var.compute_cluster ? oci_core_compute_cluster.compute_cluster[0].id : var.cluster_network ? oci_core_cluster_network.cluster_network[0].id : oci_core_instance_pool.instance_pool[0].id : "" + host = var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip[0].ip_address : oci_core_instance.controller.public_ip + controller_bool_ip = var.private_deployment ? false : true + login_bool_ip = var.private_deployment ? false : true + monitoring_bool_ip = var.private_deployment ? false : true + controller_subnet = var.private_deployment ? oci_core_subnet.private-subnet : oci_core_subnet.public-subnet private_subnet_cidr = var.private_deployment ? [var.public_subnet, var.private_subnet] : [var.private_subnet] - host_backup = var.slurm_ha ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_backup[0].ip_address : oci_core_instance.backup[0].public_ip : "none" - host_login = var.login_node ? var.private_deployment ? 
data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_login[0].ip_address : oci_core_instance.login[0].public_ip : "none" + host_backup = var.slurm_ha ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_backup[0].ip_address : oci_core_instance.backup[0].public_ip : "none" + host_login = var.login_node ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_login[0].ip_address : oci_core_instance.login[0].public_ip : "none" + host_monitoring = var.monitoring_node ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_monitoring[0].ip_address : oci_core_instance.monitoring[0].public_ip : "none" - timeout_per_batch= var.cluster_network ? 30 : 15 - timeout_ip = join("",[ (( var.node_count - ( var.node_count % 20 ) )/20 + 1 ) * local.timeout_per_batch,"m"]) + timeout_per_batch = var.cluster_network ? 30 : 15 + timeout_ip = join("", [((var.node_count - (var.node_count % 20)) / 20 + 1) * local.timeout_per_batch, "m"]) - zone_name = var.use_existing_vcn ? var.zone_name : "${local.cluster_name}.local" - platform_type = local.shape == "BM.GPU4.8" ? "AMD_ROME_BM_GPU" : local.shape == "BM.GPU.B4.8" || local.shape == "BM.GPU.H100.8" || local.shape == "BM.GPU.A100-v2.8" ? "AMD_MILAN_BM_GPU" : local.shape == "BM.Standard.E3.128" ? "AMD_ROME_BM" : local.shape == "BM.Standard.E4.128" || local.shape == "BM.DenseIO.E4.128" ? "AMD_MILAN_BM" : "GENERIC_BM" + zone_name = var.use_existing_vcn ? var.zone_name : "${local.cluster_name}.local" + platform_type = local.shape == "BM.GPU4.8" ? "AMD_ROME_BM_GPU" : local.shape == "BM.GPU.B4.8" || local.shape == "BM.GPU.A100-v2.8" ? "AMD_MILAN_BM_GPU" : local.shape == "BM.Standard.E3.128" ? "AMD_ROME_BM" : local.shape == "BM.Standard.E4.128" || local.shape == "BM.DenseIO.E4.128" ? 
"AMD_MILAN_BM" : "GENERIC_BM" } diff --git a/login.tf b/login.tf index 1aba036d..c2dde91f 100644 --- a/login.tf +++ b/login.tf @@ -1,24 +1,24 @@ -resource "oci_core_volume" "login_volume" { - count = var.login_block && var.login_node ? 1 : 0 +resource "oci_core_volume" "login_volume" { + count = var.login_block && var.login_node ? 1 : 0 availability_domain = var.login_ad - compartment_id = var.targetCompartment - display_name = "${local.cluster_name}-login" - size_in_gbs = var.login_block_volume_size - vpus_per_gb = split(".", var.login_block_volume_performance)[0] -} + compartment_id = var.targetCompartment + display_name = "${local.cluster_name}-login" + size_in_gbs = var.login_block_volume_size + vpus_per_gb = split(".", var.login_block_volume_performance)[0] +} -resource "oci_core_volume_attachment" "login_volume_attachment" { - count = var.login_block && var.login_node ? 1 : 0 +resource "oci_core_volume_attachment" "login_volume_attachment" { + count = var.login_block && var.login_node ? 1 : 0 attachment_type = "iscsi" volume_id = oci_core_volume.login_volume[0].id instance_id = oci_core_instance.login[0].id display_name = "${local.cluster_name}-login-volume-attachment" device = "/dev/oracleoci/oraclevdb" -} +} resource "oci_core_instance" "login" { - count = var.login_node ? 1 : 0 + count = var.login_node ? 1 : 0 depends_on = [oci_core_subnet.public-subnet] availability_domain = var.login_ad compartment_id = var.targetCompartment @@ -26,18 +26,18 @@ resource "oci_core_instance" "login" { dynamic "shape_config" { for_each = local.is_login_flex_shape - content { - ocpus = shape_config.value - memory_in_gbs = var.login_custom_memory ? var.login_memory : 16 * shape_config.value - } + content { + ocpus = shape_config.value + memory_in_gbs = var.login_custom_memory ? 
var.login_memory : 16 * shape_config.value + } } agent_config { is_management_disabled = true - } - display_name = "${local.cluster_name}-login" + } + display_name = "${local.cluster_name}-login" freeform_tags = { - "cluster_name" = local.cluster_name + "cluster_name" = local.cluster_name "parent_cluster" = local.cluster_name } @@ -46,29 +46,30 @@ resource "oci_core_instance" "login" { user_data = base64encode(data.template_file.controller_config.rendered) } source_details { -// source_id = var.use_standard_image ? data.oci_core_images.linux.images.0.id : local.custom_controller_image_ocid - source_id = local.login_image + // source_id = var.use_standard_image ? data.oci_core_images.linux.images.0.id : local.custom_controller_image_ocid + source_id = local.login_image boot_volume_size_in_gbs = var.login_boot_volume_size - source_type = "image" + boot_volume_vpus_per_gb = 30 + source_type = "image" } create_vnic_details { - subnet_id = local.controller_subnet_id + subnet_id = local.controller_subnet_id assign_public_ip = local.login_bool_ip } -} +} resource "oci_dns_rrset" "rrset-login" { - count = var.login_node && var.dns_entries ? 1 : 0 + count = var.login_node && var.dns_entries ? 1 : 0 zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id domain = "${var.login_node ? oci_core_instance.login[0].display_name : ""}.${local.zone_name}" rtype = "A" items { domain = "${var.login_node ? oci_core_instance.login[0].display_name : ""}.${local.zone_name}" rtype = "A" - rdata = var.login_node ? oci_core_instance.login[0].private_ip: "" + rdata = var.login_node ? 
oci_core_instance.login[0].private_ip : "" ttl = 3600 } - scope = "PRIVATE" + scope = "PRIVATE" view_id = data.oci_dns_views.dns_views.views[0].id } \ No newline at end of file diff --git a/marketplace.tf b/marketplace.tf index a735598d..fc4682eb 100755 --- a/marketplace.tf +++ b/marketplace.tf @@ -1,28 +1,13 @@ locals { -// listing_number = split(".", var.marketplace_listing)[0] - mp_listing_id = var.use_marketplace_image ? substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" - mp_controller_listing_id = var.use_marketplace_image_controller ? substr(var.marketplace_listing_controller,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" - mp_login_listing_id = var.use_marketplace_image_login ? substr(var.marketplace_listing_login,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" - mp_version_id = var.marketplace_version_id[var.marketplace_listing] - mp_controller_version_id = var.marketplace_version_id[var.marketplace_listing_controller] - mp_login_version_id = var.marketplace_version_id[var.marketplace_listing_login] -} - -/* -output "debug" { - value = data.oci_core_app_catalog_listing_resource_versions.app_catalog_listing_resource_versions.app_catalog_listing_resource_versions[0].listing_resource_id -} -*/ - -/* -data "oci_core_app_catalog_listing" "app_catalog_listing" { - count = var.use_marketplace_image ? 1 : 0 - listing_id = local.mp_listing_id -} -*/ -data "oci_core_app_catalog_listing_resource_versions" "app_catalog_listing_resource_versions" { - count = var.use_marketplace_image ? 1 : 0 - listing_id = local.mp_listing_id + // listing_number = split(".", var.marketplace_listing)[0] + mp_listing_id = var.use_marketplace_image ? substr(var.marketplace_listing, 0, 3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" + mp_controller_listing_id = var.use_marketplace_image_controller ? 
substr(var.marketplace_listing_controller, 0, 3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" + mp_login_listing_id = var.use_marketplace_image_login ? substr(var.marketplace_listing_login, 0, 3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" + mp_monitoring_listing_id = var.use_marketplace_image_monitoring ? substr(var.marketplace_listing_monitoring, 0, 3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" + mp_version_id = var.marketplace_version_id[var.marketplace_listing] + mp_controller_version_id = var.marketplace_version_id[var.marketplace_listing_controller] + mp_login_version_id = var.marketplace_version_id[var.marketplace_listing_login] + mp_monitoring_version_id = var.marketplace_version_id[var.marketplace_listing_monitoring] } resource "oci_core_app_catalog_listing_resource_version_agreement" "mp_image_agreement" { @@ -49,12 +34,12 @@ resource "oci_core_app_catalog_subscription" "mp_image_subscription" { } data "oci_core_app_catalog_listing_resource_versions" "controller_app_catalog_listing_resource_versions" { - count = var.use_marketplace_image_controller ? 1 : 0 - listing_id = local.mp_controller_listing_id + count = var.use_marketplace_image_controller ? 1 : 0 + listing_id = local.mp_controller_listing_id } resource "oci_core_app_catalog_listing_resource_version_agreement" "controller_mp_image_agreement" { - count = ( var.use_marketplace_image_controller ) ? 1 : 0 + count = (var.use_marketplace_image_controller) ? 1 : 0 listing_id = local.mp_controller_listing_id listing_resource_version = local.mp_controller_version_id @@ -62,7 +47,7 @@ resource "oci_core_app_catalog_listing_resource_version_agreement" "controller_m } resource "oci_core_app_catalog_subscription" "controller_mp_image_subscription" { - count = ( var.use_marketplace_image_controller ) ? 1 : 0 + count = (var.use_marketplace_image_controller) ? 
1 : 0 compartment_id = var.targetCompartment eula_link = oci_core_app_catalog_listing_resource_version_agreement.controller_mp_image_agreement[0].eula_link listing_id = oci_core_app_catalog_listing_resource_version_agreement.controller_mp_image_agreement[0].listing_id @@ -77,20 +62,30 @@ resource "oci_core_app_catalog_subscription" "controller_mp_image_subscription" } data "oci_core_app_catalog_listing_resource_versions" "login_app_catalog_listing_resource_versions" { - count = var.login_node && var.use_marketplace_image_login ? 1 : 0 - listing_id = local.mp_login_listing_id + count = var.login_node && var.use_marketplace_image_login ? 1 : 0 + listing_id = local.mp_login_listing_id +} +data "oci_core_app_catalog_listing_resource_versions" "monitoring_app_catalog_listing_resource_versions" { + count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0 + listing_id = local.mp_monitoring_listing_id } - resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_image_agreement" { count = var.login_node && var.use_marketplace_image_login ? 1 : 0 listing_id = local.mp_login_listing_id listing_resource_version = local.mp_login_version_id +} +resource "oci_core_app_catalog_listing_resource_version_agreement" "monitoring_mp_image_agreement" { + count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0 + + listing_id = local.mp_monitoring_listing_id + listing_resource_version = local.mp_monitoring_version_id + } resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { - count = var.login_node && var.use_marketplace_image_login ? 1 : 0 + count = var.login_node && var.use_marketplace_image_login ? 
1 : 0 compartment_id = var.targetCompartment eula_link = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].eula_link listing_id = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].listing_id @@ -103,3 +98,17 @@ resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { create = "20m" } } +resource "oci_core_app_catalog_subscription" "monitoring_mp_image_subscription" { + count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0 + compartment_id = var.targetCompartment + eula_link = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].eula_link + listing_id = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].listing_id + listing_resource_version = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].listing_resource_version + oracle_terms_of_use_link = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].oracle_terms_of_use_link + signature = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].signature + time_retrieved = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].time_retrieved + + timeouts { + create = "20m" + } +} diff --git a/monitoring.tf b/monitoring.tf new file mode 100644 index 00000000..889c8adf --- /dev/null +++ b/monitoring.tf @@ -0,0 +1,55 @@ +resource "oci_core_instance" "monitoring" { + count = var.monitoring_node ? 1 : 0 + depends_on = [oci_core_subnet.public-subnet] + availability_domain = var.monitoring_ad + compartment_id = var.targetCompartment + shape = var.monitoring_shape + + dynamic "shape_config" { + for_each = local.is_monitoring_flex_shape + content { + ocpus = shape_config.value + memory_in_gbs = var.monitoring_custom_memory ? 
var.monitoring_memory : 16 * shape_config.value + } + } + agent_config { + is_management_disabled = true + } + display_name = "${local.cluster_name}-monitoring" + + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name + } + + metadata = { + ssh_authorized_keys = "${var.ssh_key}\n${tls_private_key.ssh.public_key_openssh}" + user_data = base64encode(data.template_file.controller_config.rendered) + } + source_details { + source_id = local.monitoring_image + boot_volume_size_in_gbs = var.monitoring_boot_volume_size + boot_volume_vpus_per_gb = 30 + source_type = "image" + } + + create_vnic_details { + subnet_id = local.controller_subnet_id + assign_public_ip = local.monitoring_bool_ip + } +} + +resource "oci_dns_rrset" "rrset-monitoring" { + count = var.monitoring_node && var.dns_entries ? 1 : 0 + zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id + domain = "${var.monitoring_node ? oci_core_instance.monitoring[0].display_name : ""}.${local.zone_name}" + rtype = "A" + items { + domain = "${var.monitoring_node ? oci_core_instance.monitoring[0].display_name : ""}.${local.zone_name}" + rtype = "A" + rdata = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip : "" + ttl = 3600 + } + scope = "PRIVATE" + view_id = data.oci_dns_views.dns_views.views[0].id +} \ No newline at end of file diff --git a/mysql.tf b/mysql.tf index fd21313c..84f88f2f 100644 --- a/mysql.tf +++ b/mysql.tf @@ -1,16 +1,16 @@ resource "oci_mysql_mysql_db_system" "monitoring_mysql_db_system" { - #Required - count = var.autoscaling_monitoring && var.autoscaling_mysql_service ? 
1 : 0 - admin_password = var.admin_password - admin_username = var.admin_username - availability_domain = var.controller_ad - compartment_id = var.targetCompartment - shape_name = var.monitoring_shape_name - subnet_id = local.subnet_id - display_name = "autoscaling_monitoring" - is_highly_available = false - data_storage_size_in_gb= "50" - backup_policy { - is_enabled = false - } + #Required + count = var.autoscaling_monitoring && var.autoscaling_mysql_service ? 1 : 0 + admin_password = var.admin_password + admin_username = var.admin_username + availability_domain = var.controller_ad + compartment_id = var.targetCompartment + shape_name = var.monitoring_shape_name + subnet_id = local.subnet_id + display_name = "autoscaling_monitoring" + is_highly_available = false + data_storage_size_in_gb = "50" + backup_policy { + is_enabled = false + } } \ No newline at end of file diff --git a/network.tf b/network.tf index 6952fd64..0d99b0c5 100755 --- a/network.tf +++ b/network.tf @@ -20,19 +20,19 @@ resource "oci_core_security_list" "internal-security-list" { destination = "0.0.0.0/0" } - ingress_security_rules { + ingress_security_rules { protocol = "1" - source = "0.0.0.0/0" - icmp_options { + source = "0.0.0.0/0" + icmp_options { type = "3" code = "4" } } - ingress_security_rules { + ingress_security_rules { protocol = "1" - source = var.vcn_subnet - icmp_options { + source = var.vcn_subnet + icmp_options { type = "3" } } @@ -65,19 +65,19 @@ resource "oci_core_security_list" "public-security-list" { min = "3000" } } - ingress_security_rules { + ingress_security_rules { protocol = "1" - source = "0.0.0.0/0" - icmp_options { + source = "0.0.0.0/0" + icmp_options { type = "3" code = "4" } } - ingress_security_rules { + ingress_security_rules { protocol = "1" - source = var.vcn_subnet - icmp_options { + source = var.vcn_subnet + icmp_options { type = "3" } } @@ -149,38 +149,38 @@ resource "oci_core_dhcp_options" "cluster_dhcp_options" { count = var.use_existing_vcn ? 
0 : 1 compartment_id = var.targetCompartment options { - type = "DomainNameServer" - server_type = "VcnLocalPlusInternet" - } + type = "DomainNameServer" + server_type = "VcnLocalPlusInternet" + } options { - type = "SearchDomain" - search_domain_names = [ "${var.dns_entries? local.zone_name : "cluster.oraclevcn.com"}" ] - } - vcn_id = oci_core_vcn.vcn[0].id + type = "SearchDomain" + search_domain_names = [var.dns_entries ? local.zone_name : "cluster.oraclevcn.com"] + } + vcn_id = oci_core_vcn.vcn[0].id display_name = "${local.cluster_name}_DHCP" } resource "oci_core_subnet" "public-subnet" { - count = (var.use_existing_vcn || var.private_deployment) ? 0 : 1 + count = (var.use_existing_vcn || var.private_deployment) ? 0 : 1 # availability_domain = var.ad - vcn_id = oci_core_vcn.vcn[0].id - compartment_id = var.targetCompartment - cidr_block = trimspace(var.public_subnet) - security_list_ids = [oci_core_security_list.public-security-list[0].id] - dns_label = "public" - display_name = "${local.cluster_name}_public_subnet" - route_table_id = oci_core_route_table.public_route_table[0].id - dhcp_options_id = oci_core_dhcp_options.cluster_dhcp_options[0].id + vcn_id = oci_core_vcn.vcn[0].id + compartment_id = var.targetCompartment + cidr_block = trimspace(var.public_subnet) + security_list_ids = [oci_core_security_list.public-security-list[0].id] + dns_label = "public" + display_name = "${local.cluster_name}_public_subnet" + route_table_id = oci_core_route_table.public_route_table[0].id + dhcp_options_id = oci_core_dhcp_options.cluster_dhcp_options[0].id } resource "oci_core_subnet" "private-subnet" { - count = var.use_existing_vcn ? 0 : var.private_deployment ? 2 : 1 + count = var.use_existing_vcn ? 0 : var.private_deployment ? 
2 : 1 # availability_domain = var.ad vcn_id = oci_core_vcn.vcn[0].id compartment_id = var.targetCompartment cidr_block = trimspace(local.private_subnet_cidr[count.index]) security_list_ids = [oci_core_security_list.internal-security-list[0].id] - dns_label = "private${count.index+1}" - display_name = "${local.cluster_name}_private_subnet${count.index+1}" + dns_label = "private${count.index + 1}" + display_name = "${local.cluster_name}_private_subnet${count.index + 1}" prohibit_public_ip_on_vnic = true route_table_id = oci_core_route_table.private_route_table[0].id dhcp_options_id = oci_core_dhcp_options.cluster_dhcp_options[0].id @@ -189,10 +189,10 @@ resource "oci_core_subnet" "private-subnet" { resource "oci_dns_zone" "dns_zone" { count = var.use_existing_vcn ? 0 : 1 compartment_id = var.targetCompartment - name = "${local.cluster_name}.local" #oci_core_dhcp_options.cluster_dhcp_options[0].options.search_domain_names[0] - zone_type = "PRIMARY" - scope = "PRIVATE" - view_id = data.oci_dns_views.dns_views.views[0].id + name = "${local.cluster_name}.local" #oci_core_dhcp_options.cluster_dhcp_options[0].options.search_domain_names[0] + zone_type = "PRIMARY" + scope = "PRIVATE" + view_id = data.oci_dns_views.dns_views.views[0].id } resource "oci_dns_rrset" "rrset-cluster-network-OCI" { @@ -203,25 +203,44 @@ resource "oci_dns_rrset" "rrset-cluster-network-OCI" { items { domain = "${local.cluster_instances_names[tonumber(each.key)]}.${local.zone_name}" rtype = "A" - rdata = "${local.cluster_instances_ips[tonumber(each.key)]}" + rdata = local.cluster_instances_ips[tonumber(each.key)] ttl = 3600 } - scope = "PRIVATE" + scope = "PRIVATE" view_id = data.oci_dns_views.dns_views.views[0].id } resource "oci_dns_rrset" "rrset-cluster-network-SLURM" { - + for_each = var.slurm && var.dns_entries ? 
toset([for v in range(var.node_count) : tostring(v)]) : [] zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id - domain = "${var.queue}-permanent-${local.cluster_instances_ips_index[tonumber(each.key)]}.${local.zone_name}" + domain = "${var.hostname_convention}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${local.zone_name}" rtype = "A" items { - domain = "${var.queue}-permanent-${local.cluster_instances_ips_index[tonumber(each.key)]}.${local.zone_name}" + domain = "${var.hostname_convention}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${local.zone_name}" rtype = "A" - rdata = "${local.cluster_instances_ips[tonumber(each.key)]}" + rdata = local.cluster_instances_ips[tonumber(each.key)] ttl = 3600 } - scope = "PRIVATE" + scope = "PRIVATE" + view_id = data.oci_dns_views.dns_views.views[0].id +} + +resource "oci_dns_rrset" "fss-dns-round-robin" { + count = var.create_fss && var.dns_entries ? 1 : 0 + zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id + domain = "fss-${var.hostname_convention}.${local.zone_name}" + rtype = "A" + dynamic "items" { + for_each = oci_file_storage_mount_target.FSSMountTarget[*] + iterator = target + content { + domain = "fss-${var.hostname_convention}.${local.zone_name}" + rtype = "A" + rdata = target.value["ip_address"] + ttl = 1 + } + } + scope = "PRIVATE" view_id = data.oci_dns_views.dns_views.views[0].id -} \ No newline at end of file +} diff --git a/oci_images.tf b/oci_images.tf index cf2d2fda..bd88d82a 100644 --- a/oci_images.tf +++ b/oci_images.tf @@ -1,19 +1,19 @@ variable "marketplace_source_images" { type = map(object({ - ocid = string + ocid = string is_pricing_associated = bool - compatible_shapes = set(string) + compatible_shapes = set(string) })) default = { main_mktpl_image = { - ocid = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" + ocid = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" is_pricing_associated = false - 
compatible_shapes = [] + compatible_shapes = [] } supporting_image = { - ocid = "ocid1.image.oc1..aaaaaaaazeefig7dqaoheiyoufmllolc3tuiv2c4xueecpr33dm3k4xjip3a" + ocid = "ocid1.image.oc1..aaaaaaaazeefig7dqaoheiyoufmllolc3tuiv2c4xueecpr33dm3k4xjip3a" is_pricing_associated = false - compatible_shapes = [] + compatible_shapes = [] } } } diff --git a/outputs.tf b/outputs.tf index af5b5cba..d4770cf1 100755 --- a/outputs.tf +++ b/outputs.tf @@ -12,4 +12,8 @@ output "backup" { output "login" { value = var.login_node ? local.host_login : "No Login Node Defined" +} + +output "monitoring" { + value = var.monitoring_node ? local.host_monitoring : "No Monitoring Node Defined" } \ No newline at end of file diff --git a/playbooks/destroy.yml b/playbooks/destroy.yml index 9f413982..f7ded4a0 100755 --- a/playbooks/destroy.yml +++ b/playbooks/destroy.yml @@ -9,7 +9,7 @@ - include_role: name: slurm when: slurm|default(false)|bool -- hosts: controller, slurm_backup, login +- hosts: controller, slurm_backup, login, monitoring become: true vars: destroy: true diff --git a/playbooks/monitoring.yml b/playbooks/monitoring.yml new file mode 100644 index 00000000..8f1102b9 --- /dev/null +++ b/playbooks/monitoring.yml @@ -0,0 +1,25 @@ +- hosts: all,!monitoring + gather_facts: true + tasks: + - include_role: + name: metrics-exporter + when: cluster_monitoring|default(false)|bool + +- hosts: monitoring + gather_facts: true + tasks: + - include_role: + name: grafana + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length > 0 ) + +- hosts: controller + tasks: + - include_role: + name: grafana + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length == 0 ) + +- hosts: controller, monitoring + tasks: + - include_role: + name: prometheus + when: cluster_monitoring|default(false)|bool \ No newline at end of file diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml old mode 100755 new mode 100644 index 39efe46f..5e3adb1b --- 
a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -7,7 +7,7 @@ tasks: - include_role: name: hostname - when: slurm | default(false) | bool + when: change_hostname | default(false) | bool # for ubuntu, on all compute nodes, run --fix-broken install - hosts: compute @@ -46,18 +46,12 @@ become: true gather_facts: true tasks: - - include_role: - name: oci-cn-auth - when: cluster_network|bool and not use_compute_agent|default(false)|bool - - include_role: - name: rdma-interface - when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem - include_role: name: healthchecks -- hosts: controller,slurm_backup,login,compute +- hosts: controller,slurm_backup,login,compute, monitoring become: true vars: destroy: false @@ -206,11 +200,11 @@ name: spack when: spack|default(false)|bool - include_role: - name: slurm - when: slurm|default(false)|bool + name: prometheus + when: cluster_monitoring|default(false)|bool - include_role: - name: influxdb - when: monitoring|default(false)|bool + name: metrics-exporter + when: cluster_monitoring|default(false)|bool - include_role: - name: telegraf - when: monitoring|default(false)|bool \ No newline at end of file + name: slurm + when: slurm|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml old mode 100755 new mode 100644 index 8a599590..576cbd1f --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -6,7 +6,7 @@ tasks: - include_role: name: hostname - when: slurm | default(false) | bool + when: change_hostname | default(false) | bool # for ubuntu, on all compute nodes, run --fix-broken install - hosts: compute_to_add @@ -44,18 +44,12 @@ become: true gather_facts: true tasks: - - include_role: - name: oci-cn-auth - when: cluster_network|bool and not use_compute_agent|default(false)|bool - - include_role: - name: rdma-interface - when: cluster_network|bool and not use_compute_agent|default(false)|bool - 
include_role: name: nvidia_peermem - include_role: name: healthchecks -- hosts: controller,slurm_backup,login,compute +- hosts: controller,slurm_backup,login,compute, monitoring become: true vars: destroy: false @@ -199,11 +193,11 @@ name: spack when: spack|default(false)|bool - include_role: - name: slurm - when: slurm|default(false)|bool + name: prometheus + when: cluster_monitoring|default(false)|bool - include_role: - name: influxdb - when: monitoring|default(false)|bool + name: metrics-exporter + when: cluster_monitoring|default(false)|bool - include_role: - name: telegraf - when: monitoring|default(false)|bool \ No newline at end of file + name: slurm + when: slurm|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_remove.yml b/playbooks/resize_remove.yml index 99029c50..b5d04156 100755 --- a/playbooks/resize_remove.yml +++ b/playbooks/resize_remove.yml @@ -1,4 +1,4 @@ -- hosts: controller, slurm_backup, compute, login +- hosts: controller, slurm_backup, compute, login, monitoring become: true gather_facts: true vars: diff --git a/playbooks/resize_remove_unreachable.yml b/playbooks/resize_remove_unreachable.yml index 5d8f274f..ecc61d35 100644 --- a/playbooks/resize_remove_unreachable.yml +++ b/playbooks/resize_remove_unreachable.yml @@ -1,4 +1,4 @@ -- hosts: controller, compute, slurm_backup, login +- hosts: controller, compute, slurm_backup, login, monitoring become: true gather_facts: true vars: @@ -9,4 +9,11 @@ - "/opt/oci-hpc/conf/queues.conf" tasks: - include_role: - name: destroy_unreachable \ No newline at end of file + name: destroy_unreachable + + +- hosts: controller + become: true + tasks: + - include_role: + name: prometheus \ No newline at end of file diff --git a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml index 64020bc0..3bf2d6c9 100644 --- a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml +++ b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml @@ -53,7 +53,7 @@ 
- name: install grafana include_role: name: grafana - when: not monitoring|default(false)|bool + when: not cluster_monitoring|default(false)|bool # - name: Import mysql-2022 key # become: true diff --git a/playbooks/roles/cluster-cli/files/cluster b/playbooks/roles/cluster-cli/files/cluster index 4bb4e623..e7aab48b 100755 --- a/playbooks/roles/cluster-cli/files/cluster +++ b/playbooks/roles/cluster-cli/files/cluster @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env /usr/bin/python3 import ldap3 from ldap3 import MODIFY_ADD import click diff --git a/playbooks/roles/destroy_unreachable/tasks/common.yml b/playbooks/roles/destroy_unreachable/tasks/common.yml index 111778da..1d954a32 100644 --- a/playbooks/roles/destroy_unreachable/tasks/common.yml +++ b/playbooks/roles/destroy_unreachable/tasks/common.yml @@ -3,9 +3,8 @@ - name: Get Slurm hostnames vars: - index: "{{ unreachable_nodes | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) }}" - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" set_fact: - unreachable_slurm_nodes: "{{unreachable_slurm_nodes | default([]) + [queue+'-'+keyword+'-node-'+ item | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) ] }}" + unreachable_slurm_nodes: "{{unreachable_slurm_nodes | default([]) + [hostname_convention+'-'+ item | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) ] }}" with_items: "{{unreachable_nodes}}" when: item | ipaddr ignore_unreachable: yes @@ -16,7 +15,7 @@ set_fact: unreachable_slurm_nodes: "{{unreachable_slurm_nodes | default([]) + [item] }}" with_items: "{{unreachable_nodes}}" - when: not ( item | ipaddr ) and '-node-' in item + when: not ( item | ipaddr ) and ( item.split('-')[0] == hostname_convention or (not change_hostname|bool) ) ignore_unreachable: yes delegate_to: 127.0.0.1 run_once: 
true @@ -25,7 +24,7 @@ set_fact: unreachable_oci_nodes: "{{unreachable_slurm_nodes | default([]) + [item] }}" with_items: "{{unreachable_nodes}}" - when: not ( item | ipaddr ) and not ('-node-' in item ) + when: not ( item | ipaddr ) and item.split('-')[0] != hostname_convention and (change_hostname|bool) ignore_unreachable: yes delegate_to: 127.0.0.1 run_once: true diff --git a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml index d1bdc2a9..6013623d 100644 --- a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml +++ b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml @@ -190,65 +190,9 @@ run_once: true delegate_to: 127.0.0.1 -- name: get inactiveLine - shell: "scontrol show hostname `scontrol show topology inactive-{{item.split('-node-')[0]}} | rev | cut -d \"=\" -f 1 | rev`" - register: inactive_switch - run_once: true - delegate_to: 127.0.0.1 - with_items: "{{unreachable_slurm_nodes}}" - -- name: get Inactive Nodes - set_fact: - nodes_on_inactive_switch: "{{nodes_on_inactive_switch | default({}) | combine({item.item : item.stdout_lines } ) }}" - with_items: "{{inactive_switch.results}}" - run_once: true - delegate_to: 127.0.0.1 - when: ( item.stdout_lines | length ) > 0 - -- name: get Inactive Nodes - set_fact: - nodes_on_inactive_switch: "{{nodes_on_inactive_switch | default({}) | combine({item.item : [] } ) }}" - with_items: "{{inactive_switch.results}}" - run_once: true - delegate_to: 127.0.0.1 - when: ( item.stdout_lines | length ) == 0 - -- name: generate nodes_on_inactive_switch_condensed - shell: "scontrol show hostlistsorted {{nodes_on_inactive_switch[item] | union([item]) | join(',')}}" - register: inactive_switch_condensed - with_items: "{{unreachable_slurm_nodes}}" - run_once: true - delegate_to: 127.0.0.1 - -- name: get condensed_Nodes - set_fact: - nodes_on_switch_condensed: "{{nodes_on_inactive_switch_condensed | default({}) | combine({item.item 
: item.stdout } ) }}" - with_items: "{{inactive_switch_condensed.results}}" - run_once: true - delegate_to: 127.0.0.1 - -# - name: debug -# debug: -# msg: "replacing line SwitchName=inactive-{{item.split('-node-')[0]}}\\sNodes.* with SwitchName=inactive-{{item.split('-node-')[0]}} Nodes={{nodes_on_switch_condensed[item]}}" -# with_items: "{{unreachable_slurm_nodes}}" -# ignore_unreachable: yes -# delegate_to: 127.0.0.1 -# run_once: true - -- name: add node to inactive line - lineinfile: - path: "{{ slurm_conf_path }}/topology.conf" - regexp: "SwitchName=inactive-{{item.split('-node-')[0]}}\\sNodes.*" - line: "SwitchName=inactive-{{item.split('-node-')[0]}} Nodes={{nodes_on_switch_condensed[item]}}" - state: present - with_items: "{{unreachable_slurm_nodes}}" - ignore_unreachable: yes - delegate_to: 127.0.0.1 - run_once: true - - name: change Node Status become: true - command: "scontrol update nodename={{ item }} state=future reason=terminating" + command: "scontrol delete nodename={{ item }}" ignore_errors: true ignore_unreachable: True with_items: "{{unreachable_slurm_nodes}}" diff --git a/playbooks/roles/destroy_unreachable/tasks/slurm.yml b/playbooks/roles/destroy_unreachable/tasks/slurm.yml index e97520c7..4d2317cd 100644 --- a/playbooks/roles/destroy_unreachable/tasks/slurm.yml +++ b/playbooks/roles/destroy_unreachable/tasks/slurm.yml @@ -87,58 +87,9 @@ run_once: true delegate_to: 127.0.0.1 -- name: get inactiveLine - shell: "scontrol show hostname `scontrol show topology inactive-{{item.split('-node-')[0]}} | rev | cut -d \"=\" -f 1 | rev`" - register: inactive_switch - run_once: true - delegate_to: 127.0.0.1 - with_items: "{{unreachable_slurm_nodes}}" - -- name: get Inactive Nodes - set_fact: - nodes_on_inactive_switch: "{{nodes_on_inactive_switch | default({}) | combine({item.item : item.stdout_lines } ) }}" - with_items: "{{inactive_switch.results}}" - run_once: true - delegate_to: 127.0.0.1 - when: ( item.stdout_lines | length ) > 0 - -- name: get 
Inactive Nodes - set_fact: - nodes_on_inactive_switch: "{{nodes_on_inactive_switch | default({}) | combine({item.item : [] } ) }}" - with_items: "{{inactive_switch.results}}" - run_once: true - delegate_to: 127.0.0.1 - when: ( item.stdout_lines | length ) == 0 - -- name: generate nodes_on_inactive_switch_condensed - shell: "scontrol show hostlistsorted {{nodes_on_inactive_switch[item] | union([item]) | join(',')}}" - register: inactive_switch_condensed - with_items: "{{unreachable_slurm_nodes}}" - run_once: true - delegate_to: 127.0.0.1 - -- name: get condensed_Nodes - set_fact: - nodes_on_switch_condensed: "{{nodes_on_inactive_switch_condensed | default({}) | combine({item.item : item.stdout } ) }}" - with_items: "{{inactive_switch_condensed.results}}" - run_once: true - delegate_to: 127.0.0.1 - - -- name: add node to inactive line - lineinfile: - path: "{{ slurm_conf_path }}/topology.conf" - regexp: "SwitchName=inactive-{{item.split('-node-')[0]}}\\sNodes.*" - line: "SwitchName=inactive-{{item.split('-node-')[0]}} Nodes={{nodes_on_switch_condensed[item]}}" - state: present - with_items: "{{unreachable_slurm_nodes}}" - ignore_unreachable: yes - delegate_to: 127.0.0.1 - run_once: true - - name: change Node Status become: true - command: "scontrol update nodename={{ item }} state=future reason=terminating" + command: "scontrol delete nodename={{ item }}" ignore_errors: true ignore_unreachable: True with_items: "{{unreachable_slurm_nodes}}" diff --git a/playbooks/roles/docker/tasks/ubuntu.yml b/playbooks/roles/docker/tasks/ubuntu.yml index 3a652911..72769566 100644 --- a/playbooks/roles/docker/tasks/ubuntu.yml +++ b/playbooks/roles/docker/tasks/ubuntu.yml @@ -38,6 +38,22 @@ name: safe_yum ignore_errors: true + - name: force docker gid 999 + become: true + lineinfile: + path: /etc/group + state: present + regexp: '^docker:x:(.*)$' + line: 'docker:x:999:{{ansible_user}}' + backrefs: yes + + - name: restart docker + become: true + service: + name: docker.socket + state: 
restarted + enabled: yes + # start is not needed - it starts dokcer as part of install - name: Enable service Docker ansible.builtin.service: @@ -53,8 +69,8 @@ # pip: # name: docker - - name: add ubuntu user to docker group - user: - name: ubuntu - groups: docker - append: yes \ No newline at end of file +# - name: add ubuntu user to docker group +# user: +# name: ubuntu +# groups: docker +# append: yes diff --git a/playbooks/roles/etc-hosts/tasks/common.yml b/playbooks/roles/etc-hosts/tasks/common.yml index 97888dcb..d6d47225 100644 --- a/playbooks/roles/etc-hosts/tasks/common.yml +++ b/playbooks/roles/etc-hosts/tasks/common.yml @@ -52,13 +52,13 @@ run_once: true when: not destroy|bool and groups['compute']|length > 0 -- name: move /etc/hosts on backup slurm and login node +- name: move /etc/hosts on backup slurm, login node and monitoring node become: true copy: dest: /etc/hosts src: /etc/hosts force: yes - when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names)) + when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names) or ('monitoring' in group_names)) - name: Make sure the IP for each node was not left over in another cluster become: true @@ -66,7 +66,7 @@ dest: /etc/hosts regexp: "^127.0.1.1\\s{{hostvars[groups['controller'][0]]['inventory_hostname']}}.*" state: absent - when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names)) + when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names) or ('monitoring' in group_names)) - name: move /etc/hosts on all compute nodes become: true @@ -74,7 +74,7 @@ dest: /etc/hosts src: /tmp/hosts.etc.{{ cluster_name }} force: yes - when: ( not destroy|bool ) and (not 'controller' in group_names) and (not 'slurm_backup' in group_names) and (not 'login' in group_names) + when: ( not destroy|bool ) and (not 'controller' in group_names) and (not 'slurm_backup' in group_names) and (not 'login' in 
group_names) and (not 'monitoring' in group_names) - name: remove cluster from etc-host become: true @@ -104,4 +104,14 @@ state: absent delegate_to: "{{ groups['login'][0] }}" run_once: true - when: destroy|bool and (groups['login']|length > 0)|bool \ No newline at end of file + when: destroy|bool and (groups['login']|length > 0)|bool + +- name: remove cluster from etc-host on monitoring + become: true + blockinfile: + dest: /etc/hosts + marker: "# {mark} ANSIBLE MANAGED BLOCK {{ cluster_name }}" + state: absent + delegate_to: "{{ groups['monitoring'][0] }}" + run_once: true + when: destroy|bool and (groups['monitoring']|length > 0)|bool \ No newline at end of file diff --git a/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 b/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 index e604e118..180e46f2 100755 --- a/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 +++ b/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 @@ -9,4 +9,8 @@ {% for item in groups['login'] %} {% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} {{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} login +{% endfor %} +{% for item in groups['monitoring'] %} +{% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} +{{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} monitoring {% endfor %} \ No newline at end of file diff --git a/playbooks/roles/fix_broken/tasks/ubuntu.yml b/playbooks/roles/fix_broken/tasks/ubuntu.yml index cc8aae1b..556e8560 100644 --- a/playbooks/roles/fix_broken/tasks/ubuntu.yml +++ b/playbooks/roles/fix_broken/tasks/ubuntu.yml @@ -63,6 +63,7 @@ - linux-oracle - linux-headers-oracle - linux-image-oracle + ignore_errors: yes - name: Ensure apt process is completed become: true diff --git a/playbooks/roles/grafana/defaults/main.yml b/playbooks/roles/grafana/defaults/main.yml old mode 100755 new mode 100644 index cbd1b8e9..e59e066a 
--- a/playbooks/roles/grafana/defaults/main.yml +++ b/playbooks/roles/grafana/defaults/main.yml @@ -1,3 +1,9 @@ +user: "{{ ol_user if ansible_os_family == 'RedHat' else ubuntu_user }}" +ol_user: "opc" +ubuntu_user: "ubuntu" +dashboard_build_dir: "/tmp/dashboard-build" +grafonnet_lib_repo_url: "github.com/grafana/grafonnet-lib/grafonnet@master" +grafonnet_gen_repo_url: "github.com/grafana/grafonnet/gen/grafonnet-latest@main" grafana_api_keys_dir: /etc/opt/oci-hpc/passwords/grafana grafana_api_url: "http://localhost:3000" grafana_security: diff --git a/playbooks/roles/grafana/files/cluster.json b/playbooks/roles/grafana/files/cluster.json old mode 100755 new mode 100644 diff --git a/playbooks/roles/grafana/files/cluster_amd.json b/playbooks/roles/grafana/files/cluster_amd.json new file mode 100644 index 00000000..14449917 --- /dev/null +++ b/playbooks/roles/grafana/files/cluster_amd.json @@ -0,0 +1,5849 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "HPC Cluster dashboard", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 928, + "graphTooltip": 1, + "id": null, + "iteration": 1693307124814, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 64974, + "panels": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "color-text", + "filterable": false, + "inspect": false + }, + "links": [ + { + "title": "Details", + "url": 
"/d/00000012722/cluster-dashboard-3?orgId=1&var-datasource=InfluxDB&var-inter=10s&var-ncores=All&var-server=${__data.fields.Node}&var-mountpoint=All&var-cpu=All&var-disk=All&var-netif=All&var-gpu=All&from=1692806920934&to=1692817720934&viewPanel=62657 " + } + ], + "mappings": [], + "noValue": "No GPU Detected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "GPU" + }, + "properties": [ + { + "id": "custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Node" + }, + "properties": [ + { + "id": "custom.width", + "value": 306 + } + ] + } + ] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 65247, + "options": { + "footer": { + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 0, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "alias": "", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "groupBy": [], + "hide": false, + "measurement": "amd_rocm_smi", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT uuid,host,max(\"temperature_sensor_junction\") FROM \"amd_rocm_smi\" WHERE host =~/$server/ AND $timeFilter", + "rawQuery": true, + "refId": "A", + "resultFormat": "table", + "select": [ + [ + { + "params": [ + "temperature_sensor_junction" + ], + "type": "field" + }, + { + "params": [], + "type": "max" + } + ] + ] + }, + { + "alias": "", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "groupBy": [], + "hide": false, + "measurement": "amd_rocm_smi", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT uuid,host,min(\"temperature_sensor_junction\") FROM \"amd_rocm_smi\" WHERE host =~/$server/ AND $timeFilter;", + "rawQuery": true, + "refId": "B", + "resultFormat": 
"table", + "select": [ + [ + { + "params": [ + "temperature_sensor_junction" + ], + "type": "field" + }, + { + "params": [], + "type": "max" + } + ] + ] + } + ], + "title": "Temperature: Highest/Lowest", + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": {}, + "renameByName": { + "host": "Node", + "max": "Highest", + "min": "Lowest", + "uuid": "GPU" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "color-text", + "filterable": false, + "inspect": false + }, + "links": [ + { + "title": "utilization", + "url": "/d/00000012722/cluster-dashboard-3?orgId=1&var-datasource=InfluxDB&var-inter=10s&var-ncores=All&var-server=${__data.fields.Node}&var-mountpoint=All&var-cpu=All&var-disk=All&var-netif=All&var-gpu=GPU-04865380-bc86-dfbd-4d46-5d550037acd4&from=1692806920934&to=1692817720934&viewPanel=62521" + } + ], + "mappings": [], + "noValue": "No GPU Detected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "GPU" + }, + "properties": [ + { + "id": "custom.width", + "value": 195 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Node" + }, + "properties": [ + { + "id": "custom.width", + "value": 248 + } + ] + } + ] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 65383, + "links": [], + "options": { + "footer": { + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 2, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "alias": "", + "datasource": { + "type": 
"influxdb", + "uid": "${datasource}" + }, + "groupBy": [], + "hide": false, + "measurement": "amd_rocm_smi", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT uuid,host,max(\"utilization_gpu\") FROM \"amd_rocm_smi\" WHERE host =~/$server/ AND $timeFilter;", + "rawQuery": true, + "refId": "A", + "resultFormat": "table", + "select": [ + [ + { + "params": [ + "temperature_sensor_junction" + ], + "type": "field" + }, + { + "params": [], + "type": "max" + } + ] + ] + }, + { + "alias": "", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "groupBy": [], + "hide": false, + "measurement": "amd_rocm_smi", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT uuid,host,min(\"utilization_gpu\") FROM \"amd_rocm_smi\" WHERE host =~/$server/ AND $timeFilter;", + "rawQuery": true, + "refId": "B", + "resultFormat": "table", + "select": [ + [ + { + "params": [ + "temperature_sensor_junction" + ], + "type": "field" + }, + { + "params": [], + "type": "max" + } + ] + ] + } + ], + "title": "Utilization: Highest/Lowest", + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": {}, + "renameByName": { + "host": "Node", + "max": "Highest", + "min": "Lowest", + "uuid": "GPU" + } + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*)-:*", + "renamePattern": "" + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 0, + "y": 5 + }, + "id": 64291, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": 
[ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SHOW TAG VALUES CARDINALITY FROM amd_rocm_smi with KEY=\"uuid\" where uuid =~ /$gpu/ ", + "rawQuery": true, + "refId": "A", + "resultFormat": "table" + } + ], + "title": "# GPU", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "color-text", + "filterable": true, + "inspect": false + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 2, + "y": 5 + }, + "id": 63202, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": false + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SHOW TAG VALUES FROM amd_rocm_smi with KEY=\"name\" where uuid =~ /$gpu/ ", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "GPU", + "type": "table" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "color-text", + "filterable": true, + "inspect": false + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "time" + }, + 
"properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 6, + "y": 5 + }, + "id": 63610, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": false + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SHOW TAG VALUES FROM amd_rocm_smi with KEY=\"pstate\" WHERE (\"uuid\" =~ /^$gpu$/) ", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "P-State", + "type": "table" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 5 + }, + "id": 65110, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"utilization_gpu\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "GPU Utilization %", + "type": "gauge" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { 
+ "color": "red", + "value": 90 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 5 + }, + "id": 63747, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"utilization_memory\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Memory Utilization %", + "type": "gauge" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 85 + }, + { + "color": "red", + "value": 95 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 5 + }, + "id": 64019, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"temperature_sensor_junction\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Temperature", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 18, + "y": 5 + }, + "id": 63883, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"fan_speed\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Fan Speed", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 21, + "y": 5 + }, + "id": 64155, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"power_draw\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Power draw", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": 
{ + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "color-text", + "filterable": true, + "inspect": false + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 2, + "y": 7 + }, + "id": 63338, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": false + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT last(\"driver_version\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"index\" SLIMIT 1", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Driver Version", + "type": "table" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "color-text", + "filterable": true, + "inspect": false + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 6, + "y": 7 + }, + "id": 63474, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": false + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT last(\"cuda_version\") FROM 
\"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"index\" SLIMIT 1", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "CUDA Version", + "type": "table" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 9 + }, + "id": 64428, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"pcie_link_width_current\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Current link width", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 9 + }, + "id": 64427, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + 
}, + "query": "SELECT mean(\"pcie_link_gen_current\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Current link generation", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "rothz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 9 + }, + "id": 64837, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"clocks_current_graphics\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Graphics clock frequency", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 15, + "x": 9, + "y": 9 + }, + "hiddenSeries": false, + "id": 62521, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + 
"percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT \"utilization_gpu\" FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"uuid\"", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "GPU Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "(:*).* ", + "renamePattern": "$1" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*)-:*", + "renamePattern": "GPU-" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "}", + "renamePattern": "" + } + } + ], + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:58", + "format": "percent", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:59", + "format": "short", + "logBase": 1, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "rothz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 14 + }, + "id": 64701, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": 
"8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"clocks_current_video\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Video clock frequency", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "rothz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 14 + }, + "id": 64565, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"clocks_current_sm\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "SM clock frequency", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "rothz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 14 + }, + "id": 64564, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + 
"calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"clocks_current_memory\") FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Memory clock frequency", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "unit": "decmbytes" + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 19 + }, + "hiddenSeries": false, + "id": 62793, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT \"memory_total\" FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"uuid\"", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "hide": false, + "query": "SELECT \"memory_used\" FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"uuid\"", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Memory (Total & Used)", + 
"tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "(:*).* ", + "renamePattern": "$1" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*)-:*", + "renamePattern": "GPU-" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "}", + "renamePattern": "" + } + } + ], + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:58", + "format": "decmbytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:59", + "format": "short", + "logBase": 1, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "unit": "celsius" + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 19 + }, + "hiddenSeries": false, + "id": 62657, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT \"temperature_sensor_junction\" FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"uuid\"", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Temperature", + "tooltip": { + "shared": 
true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "(:*).* ", + "renamePattern": "$1" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*)-:*", + "renamePattern": "GPU-" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "}", + "renamePattern": "" + } + } + ], + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:58", + "format": "celsius", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:59", + "format": "short", + "logBase": 1, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 19 + }, + "hiddenSeries": false, + "id": 62929, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT \"power_draw\" FROM \"amd_rocm_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"uuid\"", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Power Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + 
"regex": "(:*).* ", + "renamePattern": "$1" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*)-:*", + "renamePattern": "GPU-" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "}", + "renamePattern": "" + } + } + ], + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:58", + "format": "kwatt", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:59", + "format": "short", + "logBase": 1, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "GPU", + "type": "row" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 62045, + "panels": [], + "title": "CPU", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 2 + }, + "height": "300", + "hiddenSeries": false, + "id": 28239, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + 
"type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "hide": false, + "measurement": "cpu_percentageBusy", + "policy": "default", + "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = 'cpu-total' AND $timeFilter GROUP BY time($interval), *", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPU usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:506", + "format": "percent", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "$$hashKey": "object:507", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 10 + }, + "height": "350", + "hiddenSeries": false, + "id": 54694, + "interval": "$inter", + "legend": { + "alignAsTable": true, + 
"avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "system_load1", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(load1) as short,mean(load5) as medium,mean(load15) as long FROM \"system\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), * ORDER BY asc", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Load averages", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:581", + "format": "short", + "logBase": 1, + "min": 0, + "show": true + }, + { + "$$hashKey": "object:582", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] 
+ }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 10 + }, + "height": "350", + "hiddenSeries": false, + "id": 61852, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "processes", + "policy": "default", + "query": "SELECT mean(running) as running, mean(blocked) as blocked, mean(sleeping) as sleeping, mean(stopped) as stopped, mean(zombies) as zombies, mean(paging) as paging, mean(unknown) as unknown FROM \"processes\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "blocked" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$server$/" + } + ] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Processes", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:657", + "format": "short", + "logBase": 1, + 
"show": true + }, + { + "$$hashKey": "object:658", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 62046, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 3 + }, + "height": "400", + "hiddenSeries": false, + "id": 12054, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/total/", + "color": "#BF1B00", + "fill": 0, + "linewidth": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "mem_inactive", + "policy": "default", + "query": "SELECT mean(total) as total, mean(used) as used, mean(cached) as cached, mean(free) as free, mean(buffered) as buffered FROM \"mem\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), 
host ORDER BY asc", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:737", + "format": "bytes", + "logBase": 1, + "min": 0, + "show": true + }, + { + "$$hashKey": "object:738", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Memory", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 62047, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 14 + }, + "height": "", + "hiddenSeries": false, + "id": 61855, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": 
"${datasource}" + }, + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "processes", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(context_switches),1s)as \"context switches\" FROM \"kernel\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "blocked" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$server$/" + } + ] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Context switches", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:837", + "format": "ops", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:838", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 14 + }, + "height": "", + "hiddenSeries": false, + "id": 61960, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "kernel", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(processes_forked),1s) as forks FROM \"kernel\" WHERE \"host\" =~ /^$server$/ AND $timeFilter GROUP BY time($interval), host fill(null)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "processes_forked" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$server$/" + } + ] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Forks", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:913", + "format": "ops", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:914", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 14 + }, + "height": "", + "hiddenSeries": false, + "id": 62042, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, 
+ "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/max/", + "color": "#890F02", + "fill": 0 + }, + { + "alias": "/opened/", + "color": "#0A437C" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "kernel", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"file-max\") as max FROM \"linux_sysctl_fs\" WHERE \"host\" =~ /^$server$/ AND $timeFilter GROUP BY time($interval), host fill(null)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "processes_forked" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$server$/" + } + ] + }, + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "kernel", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"file-nr\") as opened FROM \"linux_sysctl_fs\" WHERE \"host\" =~ /^$server$/ AND $timeFilter GROUP BY time($interval), host fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "processes_forked" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": 
[ + { + "key": "host", + "operator": "=~", + "value": "/^$server$/" + } + ] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "File descriptors", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:989", + "format": "short", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:990", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Kernel", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 62048, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 22 + }, + "hiddenSeries": false, + "id": 62043, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $tag_irq", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + 
"params": [ + "irq" + ], + "type": "tag" + }, + { + "params": [ + "host" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "interrupts", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "total" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$server$/" + } + ] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Interrupts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1069", + "format": "ops", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:1070", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Interrupts", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 62049, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 30 + }, + "height": "", + "hiddenSeries": false, + "id": 61868, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + 
"lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 4, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "cpu", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $cpu $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "hide": false, + "measurement": "cpu_percentageBusy", + "policy": "default", + "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPU usage for $cpu", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1795", + "format": "percent", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "$$hashKey": "object:1796", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Per-cpu usage", + "type": "row" + }, + { 
+ "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 62053, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 31 + }, + "hiddenSeries": false, + "id": 42026, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/ in$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "derivative", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "interface", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "net_bytes_recv", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(bytes_recv),1s)*8 as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": 
"field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "derivative", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "interface", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "net_bytes_recv", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(bytes_sent),1s)*8 as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Network Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:250", + "format": "bps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:251", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 31 + }, + "hiddenSeries": false, + "id": 28572, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + 
"rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/ in$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "derivative", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "interface", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "net_bytes_recv", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(packets_recv), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "derivative", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "interface", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "net_bytes_recv", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(packets_sent), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY 
time($interval), * fill(none)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Network Packets", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2032", + "format": "pps", + "label": "", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2033", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "decimals": 1, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 38 + }, + "hiddenSeries": false, + "id": 58901, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "derivative", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + 
}, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "interface", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "net_bytes_recv", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(drop_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "derivative", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "interface", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "net_bytes_recv", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(drop_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Network drops", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2105", + "format": "pps", + "label": "Drops per second", + "logBase": 1, + "min": 0, + "show": true + }, + { + "$$hashKey": "object:2106", + "format": "short", + "logBase": 1, + "show": 
true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 38 + }, + "hiddenSeries": false, + "id": 50643, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "derivative", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "interface", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "net_bytes_recv", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(err_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": 
"${datasource}" + }, + "dsType": "influxdb", + "function": "derivative", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "interface", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "net_bytes_recv", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(err_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Network errors", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2178", + "format": "short", + "label": "Errors per second", + "logBase": 1, + "min": 0, + "show": true + }, + { + "$$hashKey": "object:2179", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "repeat": "netif", + "title": "Network interface stats for $netif", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 43 + }, + "id": 62054, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 51 + }, + 
"hiddenSeries": false, + "id": 26024, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/in/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "swap_in", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(\"in\")) as \"in\", non_negative_derivative(mean(\"out\")) as \"out\" FROM \"swap\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Swap I/O bytes", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2454", + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2455", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": 
false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 51 + }, + "hiddenSeries": false, + "id": 61850, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/total/", + "color": "#890F02", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "swap_in", + "policy": "default", + "query": "SELECT mean(used) as \"used\", mean(total) as \"total\" FROM \"swap\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Swap usage (bytes)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + 
"show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2530", + "format": "bytes", + "logBase": 1, + "min": 0, + "show": true + }, + { + "$$hashKey": "object:2531", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Swap", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 62055, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 59 + }, + "hiddenSeries": false, + "id": 13782, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*write$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "path", + "params": [ + "tag" + ], + "type": "tag" 
+ } + ], + "measurement": "io_reads", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(reads),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "path", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "io_reads", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(writes),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Disk I/O requests for /dev/$disk", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2618", + "format": "iops", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2619", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + 
"uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 59 + }, + "hiddenSeries": false, + "id": 60200, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*write$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "path", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "io_reads", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(read_bytes),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + 
"groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "path", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "io_reads", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(write_bytes),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Disk I/O bytes for /dev/$disk", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2694", + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2695", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 59 + }, + "hiddenSeries": false, + "id": 56720, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "connected", + 
"options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*write$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "path", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "io_reads", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(read_time),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "path", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "io_reads", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(write_time),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + 
], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Disk I/O time for /dev/$disk", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2770", + "format": "ms", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2771", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "repeat": "disk", + "title": "Disk IOPS for /dev/$disk", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 49 + }, + "id": 62056, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 85 + }, + "hiddenSeries": false, + "id": 52240, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 4, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/total/", + "color": "#BF1B00", + "fill": 0, + "linewidth": 2, + "zindex": 3 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "alias": "$tag_host: mountpoint $tag_path - $col", + "datasource": { + "type": "influxdb", + 
"uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "path", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "disk_total", + "policy": "default", + "query": "SELECT mean(total) AS \"total\", mean(used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Disk usage for $mountpoint", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2952", + "format": "bytes", + "logBase": 1, + "min": 0, + "show": true + }, + { + "$$hashKey": "object:2953", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 85 + }, + "hiddenSeries": false, + "id": 33458, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + 
"maxPerRow": 4, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/used/", + "color": "#447EBC", + "zindex": 3 + }, + { + "alias": "/total/", + "bars": false, + "color": "#BF1B00", + "fill": 0, + "lines": true, + "linewidth": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "alias": "$tag_host: mountpoint $tag_path - $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "path", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "disk_inodes_free", + "policy": "default", + "query": "SELECT mean(inodes_used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "$tag_host: mountpoint $tag_path - $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "path", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "disk_inodes_free", + "policy": "default", + "query": "SELECT mean(inodes_free) + mean(inodes_used) as \"total\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY 
time($interval), \"host\", \"path\"", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Disk inodes for $mountpoint", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3025", + "format": "short", + "logBase": 1, + "min": 0, + "show": true + }, + { + "$$hashKey": "object:3026", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "repeat": "mountpoint", + "title": "Disk space usage for $mountpoint", + "type": "row" + } + ], + "refresh": false, + "schemaVersion": 36, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allFormat": "glob", + "current": { + "selected": false, + "text": "InfluxDB", + "value": "InfluxDB" + }, + "datasource": "InfluxDB telegraf", + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "name": "datasource", + "options": [], + "query": "influxdb", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "auto": true, + "auto_count": 100, + "auto_min": "30s", + "current": { + "selected": false, + "text": "10s", + "value": "10s" + }, + "hide": 0, + "includeAll": false, + "label": "Sampling", + "multi": false, + "name": "inter", + "options": [ + { + "selected": false, + "text": "auto", + "value": "$__auto_interval_inter" + }, + { + "selected": true, + "text": "10s", + "value": "10s" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "2m", + "value": "2m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { 
+ "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + } + ], + "query": "10s,30s,1m,2m,5m,10m,30m,1h", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "influxdb", + "uid": "$datasource" + }, + "definition": "select n_cpus from system", + "hide": 0, + "includeAll": true, + "label": "Clusters (ncores)", + "multi": true, + "name": "ncores", + "options": [], + "query": "select n_cpus from system", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 3, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "influxdb", + "uid": "$datasource" + }, + "definition": "select host, n_cpus from system where n_cpus = $ncores group by \"host\"", + "hide": 0, + "includeAll": true, + "label": "Cluster nodes", + "multi": true, + "name": "server", + "options": [], + "query": "select host, n_cpus from system where n_cpus = $ncores group by \"host\"", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "influxdb", + "uid": "$datasource" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Mountpoint", + "multi": true, + "name": "mountpoint", + "options": [], + "query": "SHOW TAG VALUES FROM \"disk\" WITH KEY = \"path\" WHERE host =~ /$server/", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + 
"type": "influxdb", + "uid": "$datasource" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": "CPU", + "multi": true, + "name": "cpu", + "options": [], + "query": "SHOW TAG VALUES FROM \"cpu\" WITH KEY = \"cpu\" WHERE host =~ /$server/", + "refresh": 1, + "regex": "/cpu[0-9]/", + "skipUrlSync": false, + "sort": 1, + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "influxdb", + "uid": "$datasource" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Disk", + "multi": true, + "name": "disk", + "options": [], + "query": "SHOW TAG VALUES FROM \"diskio\" WITH KEY = \"name\" WHERE host =~ /$server/", + "refresh": 1, + "regex": "[a-z]d[\\D]$|nvme[\\d]n[\\d]$", + "skipUrlSync": false, + "sort": 0, + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "influxdb", + "uid": "$datasource" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Network interface", + "multi": true, + "name": "netif", + "options": [], + "query": "SHOW TAG VALUES FROM \"net\" WITH KEY = \"interface\" WHERE host =~ /$server/", + "refresh": 1, + "regex": "^(?!.*veth|all|tap).*$", + "skipUrlSync": false, + "sort": 1, + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "definition": "SHOW TAG VALUES FROM amd_rocm_smi with KEY=\"uuid\" where host =~ /$server/", + "hide": 0, + "includeAll": true, + "label": "GPU", + "multi": true, + "name": "gpu", + "options": [], + "query": "SHOW TAG VALUES FROM amd_rocm_smi with KEY=\"uuid\" where host =~ /$server/", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + 
"timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Cluster Dashboard", + "uid": "00000012722", + "version": 50, + "weekStart": "" +} \ No newline at end of file diff --git a/playbooks/roles/grafana/files/cluster_prometheus.json b/playbooks/roles/grafana/files/cluster_prometheus.json new file mode 100644 index 00000000..ce7e4dde --- /dev/null +++ b/playbooks/roles/grafana/files/cluster_prometheus.json @@ -0,0 +1,1133 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.1.3" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 9, + "interval": "15s", + "maxPerRow": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "repeat": "interfaces", + "repeatDirection": "h", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "deriv(net_bytes_recv{interface=~\"$interfaces\", host=~\"$host\"}[5m])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:recv", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "deriv(net_bytes_sent{interface=~\"$interfaces\", host=~\"$host\"}[5m])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:sent", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Network for $interfaces", + "type": "timeseries" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 5, + "panels": [ 
+ { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "cpu_usage_system{host=~\"$host\", cpu=\"cpu-total\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:system", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "cpu_usage_user{host=~\"$host\", cpu=\"cpu-total\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:user", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "cpu_usage_steal{host=~\"$host\", cpu=\"cpu-total\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:steal", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "cpu_usage_irq{host=~\"$host\", cpu=\"cpu-total\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:irq", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "cpu_usage_iowait{host=~\"$host\", cpu=\"cpu-total\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:iowait", + "range": true, + "refId": "F", + "useBackend": false + } + ], + "title": "CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "mem_free{host=~\"$host\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:free", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "mem_used{host=~\"$host\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:used", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "mem_total{host=~\"$host\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:total", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Memory Usage", + "type": "timeseries" + } + ], + "title": "CPU", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 6, + "panels": [], + "title": "GPU", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": 
"text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "nvidia_smi_utilization_gpu{host=~\"$host\", uuid=~\"$gpu\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:{{uuid}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 27 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "nvidia_smi_temperature_gpu{host=~\"$host\", uuid=~\"$gpu\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:{{uuid}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Temperature", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { 
+ "color": "red", + "value": 80 + } + ] + }, + "unit": "mbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 27 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "nvidia_smi_memory_used{host=~\"$host\", uuid=~\"$gpu\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:{{uuid}}:used", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "nvidia_smi_memory_total{host=~\"$host\", uuid=~\"$gpu\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:{{uuid}}:total", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "GPU Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 27 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "nvidia_smi_power_draw{host=~\"$host\", uuid=~\"$gpu\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:{{uuid}}:used", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Power Draw", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "nvidia_smi_remapped_rows_uncorrectable{host=~\"$host\", uuid=~\"$gpu\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:{{uuid}}:uncorrectable", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "nvidia_smi_remapped_rows_correctable{host=~\"$host\", uuid=~\"$gpu\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{host}}:{{uuid}}:correctable", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Row Remap", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(interface)", + "hide": 0, + "includeAll": false, + "multi": true, + "name": "interfaces", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(interface)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(host)", + "hide": 0, + "includeAll": false, + "multi": true, + "name": "host", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(host)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": 
"prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(uuid)", + "hide": 0, + "includeAll": false, + "multi": true, + "name": "gpu", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(uuid)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "HPC Cluster", + "uid": "fdtz1zfpsn9j4c", + "version": 12, + "weekStart": "" +} \ No newline at end of file diff --git a/playbooks/roles/grafana/files/cluster_prometheus_v2.json b/playbooks/roles/grafana/files/cluster_prometheus_v2.json new file mode 100644 index 00000000..a74d0fc8 --- /dev/null +++ b/playbooks/roles/grafana/files/cluster_prometheus_v2.json @@ -0,0 +1,757 @@ +{ + "description": "Dashboard - covers host, GPU, RDMA and NVLink metrics\n", + "graphTooltip": 1, + "panels": [ + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "avg by(Hostname) (DCGM_FI_DEV_GPU_UTIL)", + "legendFormat": "{{ Hostname }}" + } + ], + "title": "GPU Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "hertz" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 3, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "avg by(Hostname) (DCGM_FI_DEV_SM_CLOCK)", + "legendFormat": "{{ Hostname }}" + 
} + ], + "title": "SM Clock", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "hertz" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "avg by(Hostname) (DCGM_FI_DEV_MEM_CLOCK)", + "legendFormat": "{{ Hostname }}" + } + ], + "title": "Memory Clock", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 5, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "avg by(Hostname) (DCGM_FI_DEV_FB_USED)", + "legendFormat": "{{ Hostname }}" + } + ], + "title": "Frame Buffer Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 6, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "avg by(Hostname) (DCGM_FI_DEV_MEM_COPY_UTIL)", + "legendFormat": "{{ Hostname }}" + } + ], + "title": "Memory Copy Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "watt" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 7, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "avg by(Hostname) (DCGM_FI_DEV_POWER_USAGE)", + "legendFormat": "{{ Hostname }}" + } + ], + "title": "Power Usage", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 8, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "avg by(Hostname) (DCGM_FI_DEV_ENC_UTIL)", + "legendFormat": "{{ Hostname }}" + } + ], + "title": "Encoder Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 9, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "avg by(Hostname) (DCGM_FI_DEV_DEC_UTIL)", + "legendFormat": "{{ Hostname }}" + } + ], + "title": "Decoder Utilization", + "type": "timeseries" + } + ], + "title": "GPU", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 10, + "panels": [ + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 11, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "(rdma_np_ecn_marked_roce_packets)", + "legendFormat": "{{ interface }}" + } + ], + "title": "Number of ROCEv2 packets marked for congestion", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 12, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "(rdma_out_of_sequence)", + 
"legendFormat": "{{ interface }}" + } + ], + "title": "Number of out of sequence packets received", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 13, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "(rdma_packet_seq_err)", + "legendFormat": "{{ interface }}" + } + ], + "title": "Number of received NAK sequence error packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 14, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "(rdma_local_ack_timeout_err)", + "legendFormat": "{{ interface }}" + } + ], + "title": "Number of times QPs ack timer expired", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 15, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "(rdma_roce_adp_retrans)", + "legendFormat": "{{ interface }}" + } + ], + "title": "Number of adaptive retransmissions for RoCE traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 16, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "(rdma_np_cnp_sent)", + "legendFormat": "{{ 
interface }}" + } + ], + "title": "Number of CNP packets sent", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 17, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "(rdma_rp_cnp_handled)", + "legendFormat": "{{ interface }}" + } + ], + "title": "Number of CNP packets handled to throttle", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 18, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "(rdma_rp_cnp_ignored)", + "legendFormat": "{{ interface }}" + } + ], + "title": "Number of CNP packets received and ignored", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 19, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "(rdma_rx_icrc_encapsulated)", + "legendFormat": "{{ interface }}" + } + ], + "title": "Number of RoCE packets with ICRC (Invariant Cyclic Redundancy Check) errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 20, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "(rdma_roce_slow_restart)", + "legendFormat": "{{ interface }}" + 
} + ], + "title": "Number of times RoCE slow restart was used", + "type": "timeseries" + } + ], + "title": "RDMA", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 21, + "panels": [ + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "KB" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 22, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "sum by(gpu) (nvlink_data_tx_kib)", + "legendFormat": "{{ gpu }}" + } + ], + "title": "Total data in KiB transmitted", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "KB" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 23, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "sum by(gpu) (nvlink_data_rx_kib)", + "legendFormat": "{{ gpu }}" + } + ], + "title": "Total data in KiB received", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "KB" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 24, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "expr": "sum by(gpu) (nvlink_raw_tx_kib)", + "legendFormat": "{{ gpu }}" + } + ], + "title": "Total raw bytes in KiB transmitted", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "KB" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 25, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"Prometheus" + }, + "expr": "sum by(gpu) (nvlink_raw_rx_kib)", + "legendFormat": "{{ gpu }}" + } + ], + "title": "Total raw bytes in KiB received", + "type": "timeseries" + } + ], + "title": "NVLink", + "type": "row" + } + ], + "schemaVersion": 39, + "templating": { + "list": [ + { + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "name": "hostname", + "query": "label_values(Hostname)", + "refresh": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "browser", + "title": "GPU RDMA NVLink Dashboard", + "uid": "cluster-dashboard" +} diff --git a/playbooks/roles/grafana/files/g.libsonnet b/playbooks/roles/grafana/files/g.libsonnet new file mode 100644 index 00000000..2c1ed8e8 --- /dev/null +++ b/playbooks/roles/grafana/files/g.libsonnet @@ -0,0 +1 @@ +import 'grafonnet-latest/main.libsonnet' diff --git a/playbooks/roles/grafana/files/main.jsonnet b/playbooks/roles/grafana/files/main.jsonnet new file mode 100644 index 00000000..591def11 --- /dev/null +++ b/playbooks/roles/grafana/files/main.jsonnet @@ -0,0 +1,338 @@ +local g = import './g.libsonnet'; +local variables = import './variables.libsonnet'; +local row = g.panel.row; + +local critical_status_ts = [ +{ expr: 'count_values("hostname", oca_version) by(version)', legend_format: '{{version}}', title: 'Hosts with OCA version', unit: 'none' }, +{ expr: 'count_values("hostname", gpu_count) by (instance_shape)', legend_format: '{{instance_shape}}', title: 'Shapes with matching GPU count', unit: 'none' }, +{ expr: 'count_values("hostname", up{instance=~".*9100"})', legend_format: '{{hostname}}', title: 'Hosts up', unit: 'none' }, +{ expr: 'check_bus_issue_count{hostname=~"$hostname", oci_name=~"$oci_name"}', legend_format: '{{ hostname }}', title: 'Devices fallen off bus error count', unit: 'none' }, +{ expr: 'DCGM_FI_DEV_XID_ERRORS{Hostname=~"$hostname", gpu=~"$gpu", 
oci_name=~"$oci_name"}', legend_format: '{{ Hostname }}:{{ gpu }}', title: 'Value of the last XID error encountered', unit: 'none' }, +]; + +local critical_status_stl = [ +{ expr1: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{rdma_device}}', title: 'RDMA Device Status', unit: 'none', colors: {'0': { text: 'down', color: 'red' },'1': { text: 'up', color: 'green' },} }, +{ expr1: 'gpu_row_remap_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'gpu_row_remap_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}', title: 'GPU Row Remap Error Check', unit: 'none', colors: {'0': { text: 'passed', color: 'green' },'1': { text: 'failed', color: 'red' },} }, +{ expr1: 'gpu_ecc_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'gpu_ecc_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}', title: 'GPU ECC Error Check', unit: 'none', colors: {'0': { text: 'failed', color: 'red' },'1': { text: 'passed', color: 'green' },} }, +{ expr1: 'xid_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'xid_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}', title: 'Xid Error Check', unit: 'none', colors: {'1': { text: 'passed', color: 'green' },'0': { text: 'failed', color: 'red' },} }, +]; + +local health_status = [ +{ expr1: 'ib_link_state{hostname=~"$hostname", oci_name=~"$oci_name"}==1 or vector(0)', expr2: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"} > 1', legend_format: '{{hostname}}:{{rdma_device}}', title: 'RDMA Link State (h/w metric)', unit: 'none', colors: {'1': { text: 'down', color: 'red' },} }, +{ expr1: 'rdma_link_noflap{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'rdma_link_noflap{hostname=~"$hostname", oci_name=~"$oci_name"}==1', 
legend_format: '{{hostname}}:{{rdma_device}}', title: 'RDMA Link flapping', unit: 'none', colors: {'0': { text: 'down', color: 'red' },'1': { text: 'up', color: 'green' },} }, +{ expr1: 'rttcc_status{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'rttcc_status{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{rdma_device}}', title: 'RTTCC Status', unit: 'none', colors: {'0': { text: 'disabled', color: 'green' },'1': { text: 'enabled', color: 'red' },} }, +{ expr1: 'gpu_count{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'gpu_count{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{instance_shape}}', title: 'GPU Count', unit: 'none', colors: {'0': { text: 'down', color: 'red' },'1': { text: 'up', color: 'green' },} }, +{ expr1: 'oca_version{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'oca_version{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{version}}', title: 'OCA Version', unit: 'none', colors: {'0': { text: 'down', color: 'red' },'1': { text: 'up', color: 'green' },} }, +]; + +local cluster_metrics = [ +{ expr: 'avg by (cluster_name) (node_load1)', legend_format: '1m load average {{cluster_name}}', title: 'Cluster 1m load average', unit: 'percent' }, +{ expr: 'avg by (cluster_name) (node_load5)', legend_format: '5m load average {{cluster_name}}', title: 'Cluster 5m load average', unit: 'percent' }, +{ expr: 'avg by (cluster_name) (node_load15)', legend_format: '15m load average {{cluster_name}}', title: 'Cluster 15m load average', unit: 'percent' }, +]; + +local node_metrics = [ +{ expr: '(node_load1{hostname=~"$hostname",oci_name=~"$oci_name"})', legend_format: '{{oci_name}} {{hostname}}', title: 'Instance 1m load average', unit: 'percent' }, +{ expr: '(node_load5{hostname=~"$hostname",oci_name=~"$oci_name"})', legend_format: '{{oci_name}} {{hostname}}', title: 'Instance 5m load average', unit: 'percent' }, +{ expr: 
'(node_load15{hostname=~"$hostname",oci_name=~"$oci_name"})', legend_format: '{{oci_name}} {{hostname}}', title: 'Instance 15m load average', unit: 'percent' }, +{ expr: 'ceil((1 - (node_memory_MemAvailable_bytes{hostname=~"$hostname",oci_name=~"$oci_name"}/node_memory_MemTotal_bytes{hostname=~"$hostname",oci_name=~"$oci_name"}))*100)', legend_format: '{{oci_name}} {{hostname}}', title: 'Memory utilization', unit: 'percent' }, +{ expr: 'ceil((1 - (node_filesystem_avail_bytes{hostname=~"$hostname",oci_name=~"$oci_name",mountpoint=~"$mountpoint",device!~"rootfs"} / node_filesystem_size_bytes{hostname=~"$hostname",oci_name=~"$oci_name",mountpoint=~"$mountpoint",device!~"rootfs"}))*100)', legend_format: '{{oci_name}} {{hostname}} {{mountpoint}}', title: 'Storage utilization', unit: 'percent'}, +{ expr: 'irate(node_disk_reads_completed_total{hostname=~"$hostname",oci_name=~"$oci_name"}[5m])', legend_format: '{{oci_name}} {{hostname}} {{device}}', title: 'Disk reads completed iops', unit: 'iops'}, +{ expr: 'irate(node_disk_writes_completed_total{hostname=~"$hostname",oci_name=~"$oci_name"}[5m])', legend_format: '{{oci_name}} {{hostname}} {{device}}', title: 'Disk writes completed iops', unit: 'iops'}, +{ expr: 'irate(node_disk_read_bytes_total{hostname=~"$hostname",oci_name=~"$oci_name"}[5m])', legend_format: '{{oci_name}} {{hostname}} {{device}}', title: 'Disk read bytes', unit: 'Bps'}, +{ expr: 'irate(node_disk_written_bytes_total{hostname=~"$hostname",oci_name=~"$oci_name"}[5m])', legend_format: '{{oci_name}} {{hostname}} {{device}}', title: 'Disk write bytes', unit: 'Bps'}, +{ expr: 'irate(node_disk_io_time_seconds_total{hostname=~"$hostname",oci_name=~"$oci_name"}[5m])', legend_format: '{{oci_name}} {{hostname}} {{device}}', title: 'Time spent doing I/Os', unit: 'percentunit'}, +{ expr: 'rate(node_network_receive_bytes_total{hostname=~"$hostname",oci_name=~"$oci_name",device=~"$device"}[5m])', legend_format: "{{oci_name}} {{hostname}} {{device}}", title: 'Network 
Traffic Received', unit: 'Bps'}, +{ expr: 'rate(node_network_transmit_bytes_total{hostname=~"$hostname",oci_name=~"$oci_name",device=~"$device"}[5m])', legend_format: "{{oci_name}} {{hostname}} {{device}}", title: 'Network Traffic Sent', unit: 'Bps'} +]; + +local nfs_metrics = [ +{ expr: 'rate(node_mountstats_nfs_total_read_bytes_total{hostname=~"$hostname", oci_name=~"$oci_name"}[$__range])', legend_format: '{{oci_name}} {{hostname}} {{export}}', title: 'Read Throughput', unit: 'Bps' }, +{ expr: 'rate(node_mountstats_nfs_total_write_bytes_total{hostname=~"$hostname", oci_name=~"$oci_name"}[$__range])', legend_format: '{{oci_name}} {{hostname}} {{export}}', title: 'Write Throughput', unit: 'Bps' }, +{ expr: 'sum by(oci_name, hostname) (rate(node_mountstats_nfs_operations_requests_total{hostname=~"$hostname", oci_name=~"$oci_name", operation!~"READ|WRITE"}[$__range]))', legend_format: '{{oci_name}} {{hostname}}', title: 'Metadata IOPS', unit: 'iops' }, +{ expr: 'sum by(oci_name, hostname) (rate(node_mountstats_nfs_operations_requests_total{hostname=~"$hostname", oci_name=~"$oci_name", operation=~"READ|WRITE"}[$__range]))', legend_format: '{{oci_name}} {{hostname}}', title: 'Read/Write IOPS', unit: 'iops' }, +{ expr: 'sum by(oci_name, hostname, export) (node_nfs_rpc_retransmissions_total{hostname=~"$hostname", oci_name=~"$oci_name"})', legend_format: '{{oci_name}} {{hostname}} {{export}}', title: 'NFS Retransmissions', unit: 'cps' }, +{ expr: 'avg by(oci_name, hostname, export) (rate(node_mountstats_nfs_operations_request_time_seconds_total{hostname=~"$hostname", oci_name=~"$oci_name"}[$__range]))', legend_format: '{{oci_name}} {{hostname}} {{export}}', title: 'NFS Request Time', unit: 's' }, +{ expr: 'avg by(oci_name, hostname, export) (rate(node_mountstats_nfs_operations_response_time_seconds_total{hostname=~"$hostname", oci_name=~"$oci_name"}[$__range]))', legend_format: '{{oci_name}} {{hostname}} {{export}}', title: 'NFS Response Time', unit: 's' }, +{ expr: 'avg 
by(oci_name, hostname, export) (rate(node_mountstats_nfs_operations_queue_time_seconds_total{hostname=~"$hostname", oci_name=~"$oci_name"}[$__range]))', legend_format: '{{oci_name}} {{hostname}} {{export}}', title: 'NFS Queue Time', unit: 's' }, +]; + +local dcgm_metrics = [ + { name: 'DCGM_FI_DEV_SM_CLOCK', title: 'SM Clock', unit: 'hertz' }, + { name: 'DCGM_FI_DEV_MEM_CLOCK', title: 'Memory Clock', unit: 'hertz' }, + { name: 'DCGM_FI_DEV_MEMORY_TEMP', title: 'Memory temperature (in C)', unit: 'celsius'}, + { name: 'DCGM_FI_DEV_GPU_TEMP', title: 'GPU temperature (in C)', unit: 'celsius' }, + { name: 'DCGM_FI_DEV_POWER_USAGE', title: 'Power draw (in W)', unit: 'watt' }, + { name: 'DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION', title: 'Total energy consumption since boot (in mJ)', unit: 'joule' }, + { name: 'DCGM_FI_DEV_PCIE_REPLAY_COUNTER', title: 'Total number of PCIe retries', unit: 'none' }, + { name: 'DCGM_FI_DEV_GPU_UTIL', title: 'GPU Utilization', unit: 'percent' }, + { name: 'DCGM_FI_DEV_MEM_COPY_UTIL', title: 'Memory Copy Utilization', unit: 'percent' }, + { name: 'DCGM_FI_DEV_ENC_UTIL', title: 'Encoder Utilization', unit: 'percent' }, + { name: 'DCGM_FI_DEV_DEC_UTIL', title: 'Decoder Utilization', unit: 'percent' }, + { name: 'DCGM_FI_DEV_FB_FREE', title: 'Framebuffer memory free (in MiB)', unit: 'mbytes' }, + { name: 'DCGM_FI_DEV_FB_USED', title: 'Framebuffer memory used (in MiB)', unit: 'mbytes' }, + { name: 'DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL', title: 'Total number of NVLink bandwidth counters for all lanes', unit: 'none' }, +]; + +local dcgm_errors = [ + { name: 'DCGM_FI_DEV_ECC_SBE_VOL_TOTAL', title: 'Total number of single-bit volatile ECC errors', unit: 'cps' }, + { name: 'DCGM_FI_DEV_ECC_DBE_VOL_TOTAL', title: 'Total number of double-bit volatile ECC errors', unit: 'cps' }, + { name: 'DCGM_FI_DEV_ECC_SBE_AGG_TOTAL', title: 'Total number of single-bit persistent ECC errors', unit: 'cps' }, + { name: 'DCGM_FI_DEV_ECC_DBE_AGG_TOTAL', title: 'Total 
number of double-bit persistent ECC errors', unit: 'cps' }, + { name: 'DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS', title: 'Number of remapped rows for uncorrectable errors', unit: 'cps' }, + { name: 'DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS', title: 'Number of remapped rows for correctable errors', unit: 'cps' }, + { name: 'DCGM_FI_DEV_ROW_REMAP_FAILURE', title: 'Whether remapping of rows has failed', unit: 'cps' }, + { name: 'DCGM_FI_DEV_XID_ERRORS', title: 'Value of the last XID error encountered', unit: 'none' }, +]; + +local nvlink_metrics = [ +{ name: 'nvlink_data_tx_kib', title: 'Total data transmitted', unit: 'KBs' }, +{ name: 'nvlink_data_rx_kib', title: 'Total data received', unit: 'KBs' }, +{ name: 'nvlink_raw_tx_kib', title: 'Total raw bytes transmitted', unit: 'KBs' }, +{ name: 'nvlink_raw_rx_kib', title: 'Total raw bytes received', unit: 'KBs' }, +]; + +local ib_port_metrics = [ +{ name: 'ib_port_xmit_data', title: 'Total number of data octets transmitted', unit: 'MiBs' }, +{ name: 'ib_port_rcv_data', title: 'Total number of data octets received', unit: 'MiBs' }, +{ name: 'ib_port_xmit_packets', title: 'Total number of packets transmitted', unit: 'pps' }, +{ name: 'ib_port_rcv_packets', title: 'Total number of packets received', unit: 'pps' }, +{ name: 'ib_unicast_rcv_packets', title: 'Total number of unicast packets received', unit: 'pps' }, +{ name: 'ib_unicast_xmit_packets', title: 'Total number of unicast packets transmitted', unit: 'pps' }, +{ name: 'ib_multicast_rcv_packets', title: 'Total number of multicast packets received', unit: 'pps' }, +{ name: 'ib_multicast_xmit_packets', title: 'Total number of multicast packets transmitted', unit: 'pps' }, +]; + +local roce2_errors = [ +{ name: 'rdma_np_ecn_marked_roce_packets', title: 'Number of ROCEv2 packets marked for congestion', unit: 'none' }, +{ name: 'rdma_out_of_sequence', title: 'Number of out of sequence packets received', unit: 'none' }, +{ name: 'rdma_packet_seq_err', title: 'Number of received 
NAK sequence error packets', unit: 'none' }, +{ name: 'rdma_local_ack_timeout_err', title: 'Number of times QPs ack timer expired', unit: 'none' }, +{ name: 'rdma_roce_adp_retrans', title: 'Number of adaptive retransmissions for RoCE traffic', unit: 'none' }, +{ name: 'rdma_np_cnp_sent', title: 'Number of CNP packets sent', unit: 'none' }, +{ name: 'rdma_rp_cnp_handled', title: 'Number of CNP packets handled to throttle', unit: 'none' }, +{ name: 'rdma_rp_cnp_ignored', title: 'Number of CNP packets received and ignored', unit: 'none' }, +{ name: 'rdma_rx_icrc_encapsulated', title: 'Number of RoCE packets with ICRC (Invariant Cyclic Redundancy Check) errors', unit: 'none' }, +{ name: 'rdma_roce_slow_restart', title: 'Number of times RoCE slow restart was used', unit: 'none' }, +]; + +g.dashboard.new('Cluster Dashboard') ++ g.dashboard.withUid('cluster-dashboard') ++ g.dashboard.withDescription(||| + Dashboard for GPU Clusters +|||) ++ g.dashboard.withTimezone('browser') ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.prometheus, + variables.availability_domain, + variables.compartment, + variables.rack_id, + variables.rail_id, + variables.hpc_island, + variables.cluster, + variables.oci_name, + variables.hostname, + variables.fss_mount, + variables.mountpoint, + variables.fstype, + variables.device, + variables.interface, + variables.gpu, +]) ++ g.dashboard.withPanels( + g.util.grid.makeGrid([ + row.new('Critical Status') + + row.withCollapsed(true) + + row.withPanels([ + g.panel.timeSeries.new(metric.title) + + g.panel.timeSeries.queryOptions.withTargets([ + g.query.prometheus.new( + '$PROMETHEUS_DS', + metric.expr, + ) + + g.query.prometheus.withLegendFormat(metric.legend_format) + ]) + + g.panel.timeSeries.standardOptions.withUnit(metric.unit) + + g.panel.timeSeries.gridPos.withW(24) + + g.panel.timeSeries.gridPos.withH(8) + for metric in critical_status_ts] + + [g.panel.stateTimeline.new(metric.title) + 
g.panel.stateTimeline.queryOptions.withTargets([ + g.query.prometheus.new( + '$PROMETHEUS_DS', + metric.expr1, + ) + + g.query.prometheus.withLegendFormat(metric.legend_format), + g.query.prometheus.new( + '$PROMETHEUS_DS', + metric.expr2, + ) + + g.query.prometheus.withLegendFormat(metric.legend_format), + ]) + + g.panel.stateTimeline.standardOptions.withUnit(metric.unit) + + g.panel.stateTimeline.options.withShowValue('never') + + g.panel.stateTimeline.gridPos.withW(24) + + g.panel.stateTimeline.gridPos.withH(8) + + g.panel.stateTimeline.standardOptions.withMappings( + g.panel.stateTimeline.standardOptions.mapping.ValueMap.withType() + + g.panel.stateTimeline.standardOptions.mapping.ValueMap.withOptions(metric.colors) + ) + for metric in critical_status_stl] + ), + row.new('Health Status') + + row.withCollapsed(true) + + row.withPanels([ + g.panel.stateTimeline.new(metric.title) + + g.panel.stateTimeline.queryOptions.withTargets([ + g.query.prometheus.new( + '$PROMETHEUS_DS', + metric.expr1, + ) + + g.query.prometheus.withLegendFormat(metric.legend_format), + g.query.prometheus.new( + '$PROMETHEUS_DS', + metric.expr2, + ) + + g.query.prometheus.withLegendFormat(metric.legend_format), + ]) + + g.panel.stateTimeline.standardOptions.withUnit(metric.unit) + + g.panel.stateTimeline.options.withShowValue('never') + + g.panel.stateTimeline.gridPos.withW(24) + + g.panel.stateTimeline.gridPos.withH(8) + + g.panel.stateTimeline.standardOptions.withMappings( + g.panel.stateTimeline.standardOptions.mapping.ValueMap.withType() + + g.panel.stateTimeline.standardOptions.mapping.ValueMap.withOptions(metric.colors) + ) + for metric in health_status + ]), + row.new('Cluster Metrics') + + row.withCollapsed(true) + + row.withPanels([ + g.panel.timeSeries.new(metric.title) + + g.panel.timeSeries.queryOptions.withTargets([ + g.query.prometheus.new( + '$PROMETHEUS_DS', + metric.expr, + ) + + g.query.prometheus.withLegendFormat(metric.legend_format) + ]) + + 
g.panel.timeSeries.standardOptions.withUnit(metric.unit) + + g.panel.timeSeries.gridPos.withW(24) + + g.panel.timeSeries.gridPos.withH(8) + for metric in cluster_metrics + ]), + row.new('Node Metrics') + + row.withCollapsed(true) + + row.withPanels([ + g.panel.timeSeries.new(metric.title) + + g.panel.timeSeries.queryOptions.withTargets([ + g.query.prometheus.new( + '$PROMETHEUS_DS', + metric.expr, + ) + + g.query.prometheus.withLegendFormat(metric.legend_format) + ]) + + g.panel.timeSeries.standardOptions.withUnit(metric.unit) + + g.panel.timeSeries.gridPos.withW(24) + + g.panel.timeSeries.gridPos.withH(8) + for metric in node_metrics + ]), + row.new('NFS Metrics') + + row.withCollapsed(true) + + row.withPanels([ + g.panel.timeSeries.new(metric.title) + + g.panel.timeSeries.queryOptions.withTargets([ + g.query.prometheus.new( + '$PROMETHEUS_DS', + metric.expr, + ) + + g.query.prometheus.withLegendFormat(metric.legend_format) + ]) + + g.panel.timeSeries.standardOptions.withUnit(metric.unit) + + g.panel.timeSeries.gridPos.withW(24) + + g.panel.timeSeries.gridPos.withH(8) + for metric in nfs_metrics + ]), + row.new('GPU Metrics') + + row.withCollapsed(true) + + row.withPanels([ + g.panel.timeSeries.new(metric.title) + + g.panel.timeSeries.queryOptions.withTargets([ + g.query.prometheus.new( + '$PROMETHEUS_DS', + 'avg by(Hostname, gpu) (' + metric.name + '{Hostname=~"$hostname", oci_name=~"$oci_name"})', + ) + + g.query.prometheus.withLegendFormat('{{ Hostname }}:{{ gpu }}') + ]) + + g.panel.timeSeries.standardOptions.withUnit(metric.unit) + + g.panel.timeSeries.gridPos.withW(24) + + g.panel.timeSeries.gridPos.withH(8) + for metric in dcgm_metrics + ]), + row.new('GPU Errors') + + row.withCollapsed(true) + + row.withPanels([ + g.panel.timeSeries.new(metric.title) + + g.panel.timeSeries.queryOptions.withTargets([ + g.query.prometheus.new( + '$PROMETHEUS_DS', + 'avg by(Hostname, gpu) (' + metric.name + '{Hostname=~"$hostname", oci_name=~"$oci_name"})', + ) + + 
g.query.prometheus.withLegendFormat('{{ Hostname }}:{{ gpu }}') + ]) + + g.panel.timeSeries.standardOptions.withUnit(metric.unit) + + g.panel.timeSeries.gridPos.withW(24) + + g.panel.timeSeries.gridPos.withH(8) + for metric in dcgm_errors + ]), + row.new('NVLink Metrics') + + row.withCollapsed(true) + + row.withPanels([ + g.panel.timeSeries.new(metric.title) + + g.panel.timeSeries.queryOptions.withTargets([ + g.query.prometheus.new( + '$PROMETHEUS_DS', + 'sum by(gpu) (' + metric.name + '{hostname=~"$hostname",oci_name=~"$oci_name", gpu=~"$gpu"})', + ) + + g.query.prometheus.withLegendFormat('{{ gpu }}') + ]) + + g.panel.timeSeries.standardOptions.withUnit(metric.unit) + + g.panel.timeSeries.gridPos.withW(24) + + g.panel.timeSeries.gridPos.withH(8) + for metric in nvlink_metrics + ]), + row.new('ROCEv2 Port Metrics') + + row.withCollapsed(true) + + row.withPanels([ + g.panel.timeSeries.new(metric.title) + + g.panel.timeSeries.queryOptions.withTargets([ + g.query.prometheus.new( + '$PROMETHEUS_DS', + '(' + metric.name + '{hostname=~"$hostname", oci_name=~"$oci_name", interface=~"$interface"})', + ) + + g.query.prometheus.withLegendFormat('{{oci_name}}:{{ hostname }}:{{ interface }}') + ]) + + g.panel.timeSeries.standardOptions.withUnit(metric.unit) + + g.panel.timeSeries.gridPos.withW(24) + + g.panel.timeSeries.gridPos.withH(8) + for metric in ib_port_metrics + ]), + row.new('ROCEv2 Congestion Metrics') + + row.withCollapsed(true) + + row.withPanels([ + g.panel.timeSeries.new(metric.title) + + g.panel.timeSeries.queryOptions.withTargets([ + g.query.prometheus.new( + '$PROMETHEUS_DS', + '(' + metric.name + '{hostname=~"$hostname", oci_name=~"$oci_name", interface=~"$interface"})', + ) + + g.query.prometheus.withLegendFormat('{{oci_name}}:{{ hostname }}:{{ interface }}') + ]) + + g.panel.timeSeries.standardOptions.withUnit(metric.unit) + + g.panel.timeSeries.gridPos.withW(24) + + g.panel.timeSeries.gridPos.withH(8) + for metric in roce2_errors + ]), + ]) +) diff --git 
a/playbooks/roles/grafana/files/node_exporter.json b/playbooks/roles/grafana/files/node_exporter.json new file mode 100644 index 00000000..20ee2bb1 --- /dev/null +++ b/playbooks/roles/grafana/files/node_exporter.json @@ -0,0 +1,23899 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.4.3" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "$$hashKey": "object:1058", + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 1860, + "graphTooltip": 1, + "id": null, + "links": [ + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "GitHub", + "type": "link", + "url": "https://github.com/rfmoz/grafana-dashboards" + }, + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "Grafana", + "type": "link", + "url": "https://grafana.com/grafana/dashboards/1860" + } + ], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 
0, + "y": 0 + }, + "id": 261, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Quick CPU / Mem / Disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Resource pressure via PSI", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "dark-yellow", + "value": 70 + }, + { + "color": "dark-red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 323, + "links": [], + "options": { + "displayMode": "basic", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "9.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "CPU", + "range": false, + "refId": "CPU some", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "Mem", + "range": false, + "refId": "Memory some", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + 
"editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "I/O", + "range": false, + "refId": "I/O some", + "step": 240 + } + ], + "title": "Pressure", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Busy state of all CPU cores together", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 20, + "links": [], + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=\"$node\"}[$__rate_interval])))", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "CPU Busy", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "System load over all CPU cores together", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" 
+ }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 155, + "links": [], + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "scalar(node_load1{instance=\"$node\",job=\"$job\"}) * 100 / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Sys Load", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Non available RAM memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 1 + }, + "hideTimeOverride": false, + "id": 16, + "links": [], + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ 
+ "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "((node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\", job=\"$job\"}) / node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"}) * 100", + "format": "time_series", + "hide": true, + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "(1 - (node_memory_MemAvailable_bytes{instance=\"$node\", job=\"$job\"} / node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"})) * 100", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "B", + "step": 240 + } + ], + "title": "RAM Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Used Swap", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 25 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 21, + "links": [], + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + 
"showThresholdMarkers": true + }, + "pluginVersion": "9.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "((node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"})) * 100", + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "SWAP Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Used Root FS", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 154, + "links": [], + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + 
], + "title": "Root FS Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Total number of CPU cores", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 1 + }, + "id": 14, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "CPU Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "System uptime", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 20, + "y": 1 + }, + "hideTimeOverride": true, + "id": 15, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + 
"graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}", + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Total RootFS", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 3 + }, + "id": 23, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } 
+ ], + "title": "RootFS Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Total RAM", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 20, + "y": 3 + }, + "id": 75, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "RAM Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Total SWAP", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 22, + "y": 3 + }, + "id": 18, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + 
"justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"}", + "instant": true, + "intervalFactor": 1, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "SWAP Total", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 263, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Basic CPU / Mem / Net / Disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Basic CPU info", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "percent" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Busy Iowait" + }, + "properties": [ + { + "id": 
"color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Idle" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy Iowait" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Idle" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy System" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy User" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy Other" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 77, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "width": 250 + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Busy System", + "range": true, + "refId": "A", + "step": 240 + }, + 
{ + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Busy User", + "range": true, + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy Iowait", + "range": true, + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=~\".*irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy IRQs", + "range": true, + "refId": "D", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq'}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy Other", + "range": true, + "refId": "E", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": 
"sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Idle", + "range": true, + "refId": "F", + "step": 240 + } + ], + "title": "CPU Basic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Basic memory usage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + 
"value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "SWAP Used" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + 
"options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap Used" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM Cache + Buffer" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Available" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#DEDAF7", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 78, + "links": [], + "options": { + "legend": { 
+ "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "RAM Total", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "RAM Used", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "RAM Cache + Buffer", + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "RAM Free", + "refId": "D", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "SWAP Used", + "refId": "E", + 
"step": 240 + } + ], + "title": "Memory Basic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Basic network info per interface", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Recv_bytes_eth2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Recv_bytes_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Recv_drop_eth2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Recv_drop_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Recv_errs_eth2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { 
+ "id": "byName", + "options": "Recv_errs_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CCA300", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Trans_bytes_eth2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Trans_bytes_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Trans_drop_eth2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Trans_drop_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Trans_errs_eth2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Trans_errs_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CCA300", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "recv_bytes_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "recv_drop_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "recv_drop_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#967302", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "recv_errs_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": 
"fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "recv_errs_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "trans_bytes_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "trans_bytes_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "trans_drop_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "trans_drop_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#967302", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "trans_errs_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "trans_errs_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 74, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": 
"time_series", + "intervalFactor": 1, + "legendFormat": "recv {{device}}", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "trans {{device}} ", + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Basic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Disk space used of all filesystems mounted", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 152, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} * 
100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{mountpoint}}", + "refId": "A", + "step": 240 + } + ], + "title": "Disk Space Used Basic", + "type": "timeseries" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 265, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "percentage", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 70, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "percent" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Idle - Waiting for something to happen" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Iowait - Waiting for I/O to complete" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Irq - Servicing interrupts" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", 
+ "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Nice - Niced processes executing in user mode" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Softirq - Servicing softirqs" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Steal - Time spent in other operating systems when running in a virtualized environment" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCE2DE", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "System - Processes executing in kernel mode" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "User - Normal processes executing in user mode" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#5195CE", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 3, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 250 + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "System - Processes executing in kernel mode", + "range": true, + "refId": "A", + 
"step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "User - Normal processes executing in user mode", + "range": true, + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"nice\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nice - Niced processes executing in user mode", + "range": true, + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by(instance) (irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Iowait - Waiting for I/O to complete", + "range": true, + "refId": "E", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Irq - Servicing interrupts", + "range": true, + "refId": "F", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + 
"expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"softirq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Softirq - Servicing softirqs", + "range": true, + "refId": "G", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"steal\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Steal - Time spent in other operating systems when running in a virtualized environment", + "range": true, + "refId": "H", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Idle - Waiting for something to happen", + "range": true, + "refId": "J", + "step": 240 + } + ], + "title": "CPU", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap - Swap memory usage" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused - Free memory unassigned" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Hardware Corrupted - *./" + }, + "properties": 
[ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 24, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Apps - Memory used by user-space applications", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "PageTables - Memory used to map between virtual and physical memory addresses", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "SwapCache - Memory that keeps track of pages that have been fetched from swap but not yet been modified", + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Slab - Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc)", + "refId": "D", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Cache - Parked file data (file content) cache", + "refId": "E", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Buffers - Block device (e.g. harddisk) cache", + "refId": "F", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Unused - Free memory unassigned", + "refId": "G", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Swap - Swap space used", + "refId": "H", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working", + "refId": "I", + "step": 240 + } + ], + "title": "Memory Stack", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bits out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "receive_packets_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "receive_packets_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "transmit_packets_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "transmit_packets_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 84, + "links": [], + "options": { + "legend": { + 
"calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit", + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 156, + "links": [], + "options": { + "legend": { + 
"calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{mountpoint}}", + "refId": "A", + "step": 240 + } + ], + "title": "Disk Space Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "IO read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "iops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": 
"byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, 
+ { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 45 + }, + "id": 229, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, +
"pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", + "intervalFactor": 4, + "legendFormat": "{{device}} - Reads completed", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", + "intervalFactor": 1, + "legendFormat": "{{device}} - Writes completed", + "refId": "B", + "step": 240 + } + ], + "title": "Disk IOps", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "io time" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*read*./" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" 
+ } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byType", + "options": "time" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "hidden" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 45 + }, + "id": 42, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{device}} - Successfully read bytes", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{device}} - Successfully written bytes", + "refId": "B", + "step": 240 + } + ], + "title": "I/O Usage Read / Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "%util", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "io time" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byType", + "options": "time" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "hidden" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 57 + }, + "id": 127, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + 
{ + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"} [$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}}", + "refId": "A", + "step": 240 + } + ], + "title": "I/O Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "percentage", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 70, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^Guest - /" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#5195ce", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/^GuestNice - /" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#c15c17", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 57 + }, + "id": 319, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", 
+ "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[1m])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[1m])))", + "hide": false, + "legendFormat": "Guest - Time spent running a virtual CPU for a guest operating system", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\", mode=\"nice\"}[1m])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[1m])))", + "hide": false, + "legendFormat": "GuestNice - Time spent running a niced guest (virtual CPU for guest operating system)", + "range": true, + "refId": "B" + } + ], + "title": "CPU spent seconds in guests (VMs)", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "CPU / Memory / Net / Disk", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 266, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": 
"linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + 
"value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 54 + }, + "id": 136, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Inactive - Memory which has been less recently used. It is more eligible to be reclaimed for other purposes", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Active - Memory that has been used more recently and usually not reclaimed unless absolutely necessary", + "refId": "B", + "step": 240 + } + ], + "title": "Memory Active / Inactive", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + 
"matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*CommitLimit - *./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 54 + }, + "id": 135, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Committed_AS - Amount of memory presently allocated on the system", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "CommitLimit - Amount of memory currently 
available to be allocated on the system", + "refId": "B", + "step": 240 + } + ], + "title": "Memory Committed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + 
"options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": 
"fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 64 + }, + "id": 191, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Inactive_file - File-backed memory on inactive LRU list", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Inactive_anon - Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Active_file - File-backed memory on active LRU list", + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Active_anon - Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs", + "refId": "D", + "step": 240 + } + ], + "title": 
"Memory Active / Inactive Detail", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#58140C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Dirty" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#B7DBAB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mapped" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM + Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": 
"#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "VmallocUsed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 64 + }, + "id": 130, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writeback - Memory which is actively being written back to disk", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "WritebackTmp - Memory used by FUSE for temporary writeback buffers", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Dirty - Memory which is waiting to get written back to the disk", + "refId": "C", + "step": 240 + } + ], + "title": "Memory Writeback and Dirty", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": 
"color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs 
allocated with huge pages" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 74 + }, + "id": 138, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Mapped - Used memory in mapped pages files which have been mapped, such as libraries", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Shmem - Used shared memory (shared between several processes, thus including RAM disks)", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + 
"legendFormat": "ShmemPmdMapped - Amount of shared (shmem/tmpfs) memory backed by huge pages", + "refId": "D", + "step": 240 + } + ], + "title": "Memory Shared and Mapped", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#58140C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": 
"fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Dirty" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#B7DBAB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mapped" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, 
+ { + "matcher": { + "id": "byName", + "options": "Total RAM + Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "VmallocUsed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 74 + }, + "id": 131, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "SUnreclaim - Part of Slab, that cannot be reclaimed on memory pressure", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "SReclaimable - Part of Slab, that might be reclaimed, such as caches", + "refId": "B", + "step": 240 + } + ], + "title": "Memory Slab", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": 
"none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#58140C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Dirty" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#B7DBAB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + 
"matcher": { + "id": "byName", + "options": "Mapped" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM + Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "VmallocUsed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 84 + }, + "id": 70, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + 
"displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "VmallocChunk - Largest contiguous block of vmalloc area which is free", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "VmallocTotal - Total size of vmalloc memory area", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "VmallocUsed - Amount of vmalloc area which is used", + "refId": "C", + "step": 240 + } + ], + "title": "Memory Vmalloc", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": 
"#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 84 + }, + "id": 159, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Bounce - Memory used for block device bounce buffers", + "refId": "A", + "step": 240 + } + ], + 
"title": "Memory Bounce", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#58140C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Dirty" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#B7DBAB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mapped" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM + Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": 
"#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "VmallocUsed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Inactive *./" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 94 + }, + "id": 129, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "AnonHugePages - Memory in anonymous huge pages", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "AnonPages - Memory in user pages not backed by files", + "refId": "B", + "step": 240 + } + ], + "title": "Memory Anonymous", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + 
}, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 94 + }, + "id": 160, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "KernelStack - Kernel memory stack. This is not reclaimable", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "PerCPU - Per CPU memory allocated dynamically by loadable modules", + "refId": "B", + "step": 240 + } + ], + "title": "Memory Kernel / CPU", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "pages", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#58140C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + 
{ + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Dirty" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#B7DBAB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mapped" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + 
"value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM + Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "VmallocUsed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 104 + }, + "id": 140, + "links": [], + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "HugePages_Free - Huge pages in the pool that are not yet allocated", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "HugePages_Rsvd - Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "HugePages_Surp - Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages", + "refId": "C", + "step": 240 + } + ], + "title": "Memory HugePages Counter", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#58140C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", 
+ "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Dirty" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#B7DBAB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mapped" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM" 
+ }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM + Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "VmallocUsed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 104 + }, + "id": 71, + "links": [], + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "HugePages - Total size of the pool of huge pages", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Hugepagesize - Huge Page size", + "refId": "B", + "step": 240 + } + ], + "title": "Memory HugePages Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#58140C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Dirty" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#B7DBAB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mapped" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM + Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "VmallocUsed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 114 + }, + "id": 128, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } 
+ }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "DirectMap1G - Amount of pages mapped as this size", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "DirectMap2M - Amount of pages mapped as this size", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "DirectMap4K - Amount of pages mapped as this size", + "refId": "C", + "step": 240 + } + ], + "title": "Memory DirectMap", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + 
{ + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 114 + }, + "id": 137, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Unevictable - Amount of unevictable memory that can't be swapped out for a variety of reasons", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MLocked - Size of pages locked to memory using the mlock() system call", + "refId": "B", + "step": 240 + } + ], + "title": "Memory Unevictable and MLocked", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#58140C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + 
"options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Dirty" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#B7DBAB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mapped" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM" + 
}, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM + Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "VmallocUsed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 124 + }, + "id": 132, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "NFS Unstable - Memory in NFS pages sent to the server, but not yet committed to the storage", + "refId": "A", + "step": 240 + } + ], + "title": "Memory NFS", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Memory Meminfo", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 267, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "pages out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 176, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_vmstat_pgpgin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pagesin - Page in operations", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_vmstat_pgpgout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pagesout - Page out operations", + "refId": "B", + "step": 240 + } + ], + "title": "Memory Pages In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + 
}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "pages out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 22, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_vmstat_pswpin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pswpin - Pages swapped in", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_vmstat_pswpout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pswpout - Pages swapped out", + "refId": "B", + "step": 240 + } + ], + "title": 
"Memory Pages Swap In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "faults", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": 
"Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Pgfault - Page major and minor fault operations" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 51 + }, + "id": 175, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pgfault - Page major and minor fault operations", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pgmajfault - Major page fault operations", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval]) - irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pgminfault - Minor page fault operations", + "refId": "C", + "step": 240 + } + ], + "title": "Memory Page Faults", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + 
"color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#58140C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Dirty" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { 
+ "fixedColor": "#B7DBAB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mapped" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM + Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total Swap" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "VmallocUsed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 307, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "oom killer invocations ", + "refId": "A", + "step": 240 + } + ], + "title": "OOM Killer", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Memory Vmstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 293, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + 
"mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Variation*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 260, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_timex_estimated_error_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Estimated error in seconds", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_timex_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Time offset in between local system and reference clock", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_timex_maxerror_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Maximum error in seconds", + "refId": "C", + "step": 240 + } + ], + "title": "Time Synchronized Drift", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 291, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_timex_loop_time_constant{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Phase-locked loop time adjust", + "refId": "A", + "step": 240 + } + ], + "title": "Time PLL Adjust", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": 
false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Variation*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 168, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_timex_sync_status{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Is clock synchronized to a reliable server (1 = yes, 0 = no)", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_timex_frequency_adjustment_ratio{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Local clock frequency adjustment", + "refId": "B", + "step": 240 + } + ], + "title": "Time Synchronized Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 294, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_timex_tick_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Seconds between clock ticks", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_timex_tai_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "International Atomic Time (TAI) offset", + "refId": "B", + "step": 240 + } + ], + "title": "Time Misc", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "System Timesync", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": 
"000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 312, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 73 + }, + "id": 62, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_procs_blocked{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Processes blocked waiting for I/O to complete", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_procs_running{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Processes in runnable state", + "refId": "B", + "step": 240 + } + ], + "title": "Processes 
Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Enable with --collector.processes argument on node-exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 73 + }, + "id": 315, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_processes_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ state }}", + "refId": "A", + "step": 240 + } + ], + "title": "Processes State", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": 
"forks / sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 83 + }, + "id": 148, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_forks_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Processes forks second", + "refId": "A", + "step": 240 + } + ], + "title": "Processes Forks", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max.*/" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 83 + }, + "id": 149, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Processes virtual memory size in bytes", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "process_resident_memory_max_bytes{instance=\"$node\",job=\"$job\"}", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Maximum amount of virtual memory available in bytes", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Processes virtual memory size in bytes", + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"irate(process_virtual_memory_max_bytes{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Maximum amount of virtual memory available in bytes", + "refId": "D", + "step": 240 + } + ], + "title": "Processes Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Enable with --collector.processes argument on node-exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "PIDs limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 93 + }, + "id": 313, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${datasource}" + }, + "expr": "node_processes_pids{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Number of PIDs", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_processes_max_processes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "PIDs limit", + "refId": "B", + "step": 240 + } + ], + "title": "PIDs Number and Limit", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*waiting.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 93 + }, + "id": 305, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": 
"9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{ cpu }} - seconds spent running a process", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{ cpu }} - seconds spent by processing waiting for this CPU", + "refId": "B", + "step": 240 + } + ], + "title": "Process schedule stats Running / Waiting", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Enable with --collector.processes argument on node-exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Threads limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + 
"mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 103 + }, + "id": 314, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_processes_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Allocated threads", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_processes_max_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Threads limit", + "refId": "B", + "step": 240 + } + ], + "title": "Threads Number and Limit", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "System Processes", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 269, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": 
"linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 8, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_context_switches_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Context switches", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_intr_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Interrupts", + "refId": "B", + "step": 240 + } + ], + "title": "Context Switches / Interrupts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": 
"never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 7, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_load1{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 1m", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_load5{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 5m", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_load15{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 15m", + "refId": "C", + "step": 240 + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Max" + }, + "properties": [ + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 10 + }, + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": false, + "viz": false + } + }, + { + "id": "custom.fillBelowTo", + "value": "Min" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Min" + }, + "properties": [ + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + }, + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": false, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 321, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "node_cpu_scaling_frequency_hertz{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{ cpu }}", + "range": true, + "refId": "B", 
+ "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "avg(node_cpu_scaling_frequency_max_hertz{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Max", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "avg(node_cpu_scaling_frequency_min_hertz{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Min", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "CPU Frequency Scaling", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "https://docs.kernel.org/accounting/psi.html", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Memory some" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", 
+ "options": "Memory full" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "I/O some" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "I/O full" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 322, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "CPU some", + "range": true, + "refId": "CPU some", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Memory some", + "range": true, + "refId": "Memory some", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(node_pressure_memory_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Memory full", + "range": true, + "refId": "Memory full", + "step": 240 + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "I/O some", + "range": true, + "refId": "I/O some", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(node_pressure_io_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "I/O full", + "range": true, + "refId": "I/O full", + "step": 240 + } + ], + "title": "Pressure Stall Information", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Enable with --collector.interrupts argument on node-exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Critical*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + }, + { + "id": 
"custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 46 + }, + "id": 259, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_interrupts_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ type }} - {{ info }}", + "refId": "A", + "step": 240 + } + ], + "title": "Interrupts Detail", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 
12, + "y": 46 + }, + "id": 306, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_schedstat_timeslices_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{ cpu }}", + "refId": "A", + "step": 240 + } + ], + "title": "Schedule timeslices executed by each cpu", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 151, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": 
"9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_entropy_available_bits{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Entropy available to random number generators", + "refId": "A", + "step": 240 + } + ], + "title": "Entropy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 308, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(process_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Time spent", + "refId": "A", + "step": 240 + } + ], + "title": "CPU time spent 
in user and system contexts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 66 + }, + "id": 64, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "process_max_fds{instance=\"$node\",job=\"$job\"}", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Maximum open file descriptors", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "process_open_fds{instance=\"$node\",job=\"$job\"}", + 
"interval": "", + "intervalFactor": 1, + "legendFormat": "Open file descriptors", + "refId": "B", + "step": 240 + } + ], + "title": "File Descriptors", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "System Misc", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 304, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "temperature", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Critical*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 59 + 
}, + "id": 158, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_hwmon_temp_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ chip_name }} {{ sensor }} temp", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_hwmon_temp_crit_alarm_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ chip_name }} {{ sensor }} Critical Alarm", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_hwmon_temp_crit_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ chip_name }} {{ sensor }} Critical", + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_hwmon_temp_crit_hyst_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ chip_name }} {{ sensor }} Critical Historical", + "refId": "D", + "step": 240 + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "expr": "node_hwmon_temp_max_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ chip_name }} {{ sensor }} Max", + "refId": "E", + "step": 240 + } + ], + "title": "Hardware temperature monitor", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 59 + }, + "id": 300, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + 
"type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_cooling_device_cur_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Current {{ name }} in {{ type }}", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_cooling_device_max_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Max {{ name }} in {{ type }}", + "refId": "B", + "step": 240 + } + ], + "title": "Throttle cooling device", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 69 + }, + "id": 302, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "expr": "node_power_supply_online{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ power_supply }} online", + "refId": "A", + "step": 240 + } + ], + "title": "Power supply", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Hardware Misc", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 296, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 46 + }, + "id": 297, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_systemd_socket_accepted_connections_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ name }} Connections", + "refId": "A", + "step": 240 + } + ], + "title": "Systemd Sockets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FF9830", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#73BF69", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Deactivating" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FFCB7D", 
+ "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Activating" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C8F2C2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 46 + }, + "id": 298, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"activating\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Activating", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"active\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Active", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"deactivating\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Deactivating", + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"failed\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Failed", + "refId": "D", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"inactive\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + 
"legendFormat": "Inactive", + "refId": "E", + "step": 240 + } + ], + "title": "Systemd Units State", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Systemd", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 270, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The number (after merges) of I/O requests completed per second for the device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "IO read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "iops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": 
"fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 9, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "intervalFactor": 4, + "legendFormat": "{{device}} - Reads completed", + "refId": "A", + "step": 240 + 
}, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "intervalFactor": 1, + "legendFormat": "{{device}} - Writes completed", + "refId": "B", + "step": 240 + } + ], + "title": "Disk IOps Completed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The number of bytes read from or written to the device per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + 
"id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + 
}, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 33, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "{{device}} - Read bytes", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Written bytes", + "refId": "B", + "step": 240 + } + ], + "title": "Disk R/W Data", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "time. read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": 
[ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": 
"/.*sdc3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 57 + }, + "id": 37, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - Read wait time avg", + "refId": 
"A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} - Write wait time avg", + "refId": "B", + "step": 240 + } + ], + "title": "Disk Average Wait Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The average queue length of the requests that were issued to the device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "aqu-sz", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + 
"id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + 
}, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 57 + }, + "id": 35, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}}", + "refId": "A", + "step": 240 + } + ], + "title": "Average Queue Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "${datasource}" + }, + "description": "The number of read and write requests merged per second that were queued to the device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "I/Os", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "iops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + 
"id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 67 + }, + "id": 133, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "intervalFactor": 1, + "legendFormat": "{{device}} - Read merged", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "intervalFactor": 1, + "legendFormat": "{{device}} - Write merged", + "refId": "B", + "step": 240 + } + ], + "title": "Disk R/W Merged", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Percentage of elapsed time during which I/O requests were issued to the device 
(bandwidth utilization for the device). Device saturation occurs when this value is close to 100% for devices serving requests serially. But for devices serving requests in parallel, such as RAID arrays and modern SSDs, this number does not reflect their performance limits.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "%util", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + 
"id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 67 + }, + "id": 36, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - IO", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - discard", + "refId": "B", + "step": 240 + } + ], + "title": "Time Spent Doing I/Os", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The number of outstanding requests at the 
instant the sample was taken. Incremented as requests are given to appropriate struct request_queue and decremented as they finish.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Outstanding req.", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": 
[ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": 
"/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 77 + }, + "id": 34, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_disk_io_now{instance=\"$node\",job=\"$job\"}", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - IO now", + "refId": "A", + "step": 240 + } + ], + "title": "Instantaneous Queue Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "IOs", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "iops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": 
{ + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + 
] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 77 + }, + "id": 301, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - Discards completed", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} - Discards merged", + "refId": "B", + "step": 240 + } + ], + "title": "Disk IOps Discards completed / merged", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Storage Disk", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 271, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" 
+ }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 62 + }, + "id": 43, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Available", + "metric": "", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Free", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Size", + "refId": "C", + "step": 240 + } + ], + "title": "Filesystem space available", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "file nodes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 62 + }, + "id": 41, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Free file nodes", + "refId": "A", + "step": 240 + } + ], + "title": "File Nodes Free", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "files", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + 
"pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 72 + }, + "id": 28, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Max open files", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Open files", + "refId": "B", + "step": 240 + } + ], + "title": "File Descriptor", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "file Nodes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 72 + }, + "id": 219, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - File nodes total", + "refId": "A", + "step": 240 + } + ], + "title": "File Nodes Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } 
+ ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "/ ReadOnly" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 82 + }, + "id": 44, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - ReadOnly", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs',fstype!~'tmpfs'}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Device error", + "refId": "B", + "step": 240 + } + ], + "title": "Filesystem in ReadOnly / Error", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Storage Filesystem", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 272, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": 
"line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "receive_packets_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "receive_packets_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "transmit_packets_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "transmit_packets_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 60, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"irate(node_network_receive_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_transmit_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit", + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic by Packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 142, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + 
"width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_receive_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive errors", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_transmit_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit errors", + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 57 + }, + "id": 143, + 
"links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_receive_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive drop", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_transmit_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit drop", + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Drop", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + 
"properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 57 + }, + "id": 141, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_receive_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive compressed", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_transmit_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit compressed", + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Compressed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + 
"steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 67 + }, + "id": 146, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_receive_multicast_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive multicast", + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Multicast", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + 
"matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 67 + }, + "id": 144, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_receive_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive fifo", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_transmit_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit fifo", + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Fifo", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 77 + }, + "id": 145, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_receive_frame_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive frame", + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Frame", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": 
"short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 77 + }, + "id": 231, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_transmit_carrier_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Statistic transmit_carrier", + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Carrier", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 87 + }, + "id": 232, + "links": [], + "options": { 
+ "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_network_transmit_colls_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit colls", + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Colls", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "entries", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "NF conntrack limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 87 + }, + "id": 61, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + 
"max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_nf_conntrack_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "NF conntrack entries", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_nf_conntrack_entries_limit{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "NF conntrack limit", + "refId": "B", + "step": 240 + } + ], + "title": "NF Conntrack", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Entries", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 97 + }, + "id": 230, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true 
+ }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_arp_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ device }} - ARP entries", + "refId": "A", + "step": 240 + } + ], + "title": "ARP Entries", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 97 + }, + "id": 288, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_network_mtu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ device }} - Bytes", + "refId": 
"A", + "step": 240 + } + ], + "title": "MTU", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 107 + }, + "id": 280, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_network_speed_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ device }} - Speed", + "refId": "A", + "step": 240 + } + ], + "title": "Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets", + "axisPlacement": "auto", + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 107 + }, + "id": 289, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_network_transmit_queue_length{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ device }} - Interface transmit queue length", + "refId": "A", + "step": 240 + } + ], + "title": "Queue Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packetes drop (-) / process (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Dropped.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 117 + }, + "id": 290, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_softnet_processed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{cpu}} - Processed", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_softnet_dropped_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{cpu}} - Dropped", + "refId": "B", + "step": 240 + } + ], + "title": "Softnet Packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 117 + }, + "id": 310, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_softnet_times_squeezed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{cpu}} - Squeezed", + "refId": "A", + "step": 240 + } + ], + "title": "Softnet Out of Quota", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + 
"mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 127 + }, + "id": 309, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_network_up{operstate=\"up\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{interface}} - Operational state UP", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_network_carrier{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "instant": false, + "legendFormat": "{{device}} - Physical link state", + "refId": "B" + } + ], + "title": "Network Operational Status", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Network Traffic", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 273, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + 
"tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 63, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_TCP_alloc{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCP_alloc - Allocated sockets", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCP_inuse - Tcp sockets currently in use", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_TCP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCP_mem - Used memory for tcp", + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_TCP_orphan{instance=\"$node\",job=\"$job\"}", + "format": 
"time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCP_orphan - Orphan sockets", + "refId": "D", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_TCP_tw{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCP_tw - Sockets waiting close", + "refId": "E", + "step": 240 + } + ], + "title": "Sockstat TCP", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 124, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_UDPLITE_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": 
"", + "intervalFactor": 1, + "legendFormat": "UDPLITE_inuse - Udplite sockets currently in use", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "UDP_inuse - Udp sockets currently in use", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_UDP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "UDP_mem - Used memory for udp", + "refId": "C", + "step": 240 + } + ], + "title": "Sockstat UDP", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 125, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 
300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_FRAG_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "FRAG_inuse - Frag sockets currently in use", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_RAW_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "RAW_inuse - Raw sockets currently in use", + "refId": "C", + "step": 240 + } + ], + "title": "Sockstat FRAG / RAW", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 220, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + 
"width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "mem_bytes - TCP sockets in that state", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "mem_bytes - UDP sockets in that state", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_FRAG_memory{instance=\"$node\",job=\"$job\"}", + "interval": "", + "intervalFactor": 1, + "legendFormat": "FRAG_memory - Used memory for frag", + "refId": "C" + } + ], + "title": "Sockstat Memory Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "sockets", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + 
"overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 68 + }, + "id": 126, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_sockstat_sockets_used{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Sockets_used - Sockets currently in use", + "refId": "A", + "step": 240 + } + ], + "title": "Sockstat Used", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Network Sockstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 274, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "octets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": 
"red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 221, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_IpExt_InOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "InOctets - Received octets", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_IpExt_OutOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "OutOctets - Sent octets", + "refId": "B", + "step": 240 + } + ], + "title": "Netstat IP In / Out Octets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "datagrams", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 81, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Ip_Forwarding{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Forwarding - IP forwarding", + "refId": "A", + "step": 240 + } + ], + "title": "Netstat IP Forwarding", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "messages out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + 
"overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 115, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Icmp_InMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "InMsgs - Messages which the entity received. Note that this counter includes all those counted by icmpInErrors", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Icmp_OutMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "OutMsgs - Messages which this entity attempted to send. 
Note that this counter includes all those counted by icmpOutErrors", + "refId": "B", + "step": 240 + } + ], + "title": "ICMP In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "messages out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 43 + }, + "id": 50, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Icmp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "InErrors - Messages which the entity received but determined as having ICMP-specific errors (bad 
ICMP checksums, bad length, etc.)", + "refId": "A", + "step": 240 + } + ], + "title": "ICMP Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "datagrams out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Snd.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 53 + }, + "id": 55, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Udp_InDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + 
"intervalFactor": 1, + "legendFormat": "InDatagrams - Datagrams received", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Udp_OutDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "OutDatagrams - Datagrams sent", + "refId": "B", + "step": 240 + } + ], + "title": "UDP In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "datagrams", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 53 + }, + "id": 109, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Udp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": 
"time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "InErrors - UDP Datagrams that could not be delivered to an application", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Udp_NoPorts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "NoPorts - UDP Datagrams received on a port with no listener", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_UdpLite_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "InErrors Lite - UDPLite Datagrams that could not be delivered to an application", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Udp_RcvbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "RcvbufErrors - UDP buffer errors received", + "refId": "D", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Udp_SndbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "SndbufErrors - UDP buffer errors send", + "refId": "E", + "step": 240 + } + ], + "title": "UDP Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "datagrams out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { 
+ "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Snd.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 63 + }, + "id": 299, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Tcp_InSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "InSegs - Segments received, including those received in error. 
This count includes segments received on currently established connections", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Tcp_OutSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "OutSegs - Segments sent, including those on current connections but excluding those containing only retransmitted octets", + "refId": "B", + "step": 240 + } + ], + "title": "TCP In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 63 + }, + "id": 104, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + 
"expr": "irate(node_netstat_TcpExt_ListenOverflows{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "ListenOverflows - Times the listen queue of a socket overflowed", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_TcpExt_ListenDrops{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "ListenDrops - SYNs to LISTEN sockets ignored", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_TcpExt_TCPSynRetrans{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCPSynRetrans - SYN-SYN/ACK retransmits to break down retransmissions in SYN, fast/timeout retransmits", + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Tcp_RetransSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "RetransSegs - Segments retransmitted - that is, the number of TCP segments transmitted containing one or more previously transmitted octets", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Tcp_InErrs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "InErrs - Segments received in error (e.g., bad TCP checksums)", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Tcp_OutRsts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "OutRsts - Segments sent with RST flag", 
+ "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_TCPRcvQDrop{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "legendFormat": "TCPRcvQDrop - Packets meant to be queued in rcv queue but dropped because socket rcvbuf limit hit", + "range": true, + "refId": "G" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_TCPOFOQueue{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "legendFormat": "TCPOFOQueue - TCP layer receives an out of order packet and has enough memory to queue it", + "range": true, + "refId": "H" + } + ], + "title": "TCP Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "connections", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*MaxConn *./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + }, + { + "id": 
"custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 73 + }, + "id": 85, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_netstat_Tcp_CurrEstab{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "CurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_netstat_Tcp_MaxConn{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "MaxConn - Limit on the total number of TCP connections the entity can support (Dynamic is \"-1\")", + "refId": "B", + "step": 240 + } + ], + "title": "TCP Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Sent.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 73 + }, + "id": 91, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_TcpExt_SyncookiesFailed{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "SyncookiesFailed - Invalid SYN cookies received", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_TcpExt_SyncookiesRecv{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "SyncookiesRecv - SYN cookies received", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_TcpExt_SyncookiesSent{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "SyncookiesSent - SYN cookies sent", + "refId": "C", + "step": 240 + } + ], + "title": "TCP SynCookie", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "connections", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 83 + }, + "id": 82, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Tcp_ActiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "ActiveOpens - TCP connections that have made a direct transition to the SYN-SENT state from the CLOSED state", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "irate(node_netstat_Tcp_PassiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "PassiveOpens - TCP connections that have made a direct transition to the SYN-RCVD state from the LISTEN 
state", + "refId": "B", + "step": 240 + } + ], + "title": "TCP Direct Transition", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Enable with --collector.tcpstat argument on node-exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "connections", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 83 + }, + "id": 320, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "node_tcp_connection_states{state=\"established\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "established - TCP sockets in established state", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": 
"node_tcp_connection_states{state=\"fin_wait2\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "fin_wait2 - TCP sockets in fin_wait2 state", + "range": true, + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "node_tcp_connection_states{state=\"listen\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "listen - TCP sockets in listen state", + "range": true, + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "node_tcp_connection_states{state=\"time_wait\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "time_wait - TCP sockets in time_wait state", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "TCP Stat", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Network Netstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 279, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 
5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 66 + }, + "id": 40, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{collector}} - Scrape duration", + "refId": "A", + "step": 240 + } + ], + "title": "Node Exporter Scrape Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { 
+ "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*error.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + }, + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 66 + }, + "id": 157, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{collector}} - Scrape success", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{collector}} - Scrape textfile error (1 = true)", + "refId": "B", + "step": 240 + } + ], + "title": "Node Exporter Scrape", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "refId": "A" + } + ], + "title": "Node Exporter", + "type": "row" + } + ], + "refresh": "1m", + "revision": 1, + "schemaVersion": 38, + "style": "dark", + "tags": [ + "linux" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + 
"options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": { + "query": "label_values(node_uname_info, job)", + "refId": "Prometheus-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(node_uname_info{job=\"$job\"}, instance)", + "hide": 0, + "includeAll": false, + "label": "Host", + "multi": false, + "name": "node", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\"}, instance)", + "refId": "Prometheus-node-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + }, + "hide": 2, + "includeAll": false, + "multi": false, + "name": "diskdevices", + "options": [ + { + "selected": true, + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + } + ], + "query": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Node 
Exporter Full", + "uid": "rYdddlPWk", + "version": 92, + "weekStart": "" +} \ No newline at end of file diff --git a/playbooks/roles/grafana/files/variables.libsonnet b/playbooks/roles/grafana/files/variables.libsonnet new file mode 100644 index 00000000..c54f54aa --- /dev/null +++ b/playbooks/roles/grafana/files/variables.libsonnet @@ -0,0 +1,120 @@ +local g = import './g.libsonnet'; +local var = g.dashboard.variable; + +{ + prometheus: + var.datasource.new('PROMETHEUS_DS', 'prometheus') + + var.datasource.generalOptions.showOnDashboard.withValueOnly(), + + compartment: + var.query.new('compartment') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('compartment', 'up') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + hpc_island: + var.query.new('hpc_island') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('hpc_island', 'up') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + rack_id: + var.query.new('rackID') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('rackID', 'up') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + rail_id: + var.query.new('rail_id') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('rail_id', 'up') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + availability_domain: + var.query.new('AD') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('AD', 'up') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + oci_name: + 
var.query.new('oci_name') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('oci_name', 'up') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + hostname: + var.query.new('hostname') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('hostname', 'up') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + mountpoint: + var.query.new('mountpoint') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('mountpoint', 'node_filesystem_free_bytes') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + fstype: + var.query.new('fstype') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('fstype', 'node_filesystem_free_bytes') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + interface: + var.query.new('interface') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('interface', 'rdma_np_ecn_marked_roce_packets') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + device: + var.query.new('device') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('device', 'node_network_receive_bytes_total') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + gpu: + var.query.new('gpu') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('gpu', 'DCGM_FI_DEV_GPU_UTIL') + + var.query.selectionOptions.withMulti() + + 
var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + cluster: + var.query.new('cluster_name') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('cluster_name', 'up') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), + + fss_mount: + var.query.new('fss_mount') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.queryTypes.withLabelValues('fss_mount', 'up') + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll() + + var.query.withRefresh(1), +} diff --git a/playbooks/roles/grafana/tasks/dashboard.yml b/playbooks/roles/grafana/tasks/dashboard.yml new file mode 100644 index 00000000..1ad9372f --- /dev/null +++ b/playbooks/roles/grafana/tasks/dashboard.yml @@ -0,0 +1,78 @@ +--- +- name: Download jb + become: true + get_url: + url: https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v0.6.0/jb-linux-amd64 + dest: /usr/local/bin/jb + mode: '0755' + +- name: Download jsonnet + become: true + get_url: + url: https://github.com/google/go-jsonnet/releases/download/v0.20.0/go-jsonnet_0.20.0_Linux_x86_64.tar.gz + dest: /tmp + +- name: Extract jsonnet + become: true + unarchive: + src: /tmp/go-jsonnet_0.20.0_Linux_x86_64.tar.gz + dest: /usr/local/bin + mode: '0755' + remote_src: true + + +- name: Delete existing build directory + file: + path: "{{ dashboard_build_dir }}" + state: absent + +- name: Create temporary build directory + ansible.builtin.file: + path: "{{ dashboard_build_dir }}" + state: directory + mode: '0755' + owner: "{{ user }}" + group: "{{ user }}" + +- name: Copy dashboard source files to build directory + copy: + src: "{{ item }}" + dest: "{{ dashboard_build_dir }}" + owner: "{{ user }}" + group: "{{ user }}" + loop: + - main.jsonnet + - g.libsonnet + - variables.libsonnet + +- name: Build dashboard with Jsonnet + ansible.builtin.shell: | + 
/usr/local/bin/jb init + /usr/local/bin/jb install {{ grafonnet_gen_repo_url }} + /usr/local/bin/jb install {{ grafonnet_lib_repo_url }} + /usr/local/bin/jsonnet -J vendor main.jsonnet -o cluster_prometheus_v2.json + args: + chdir: "{{ dashboard_build_dir }}" + +- name: Ensure /opt/oci-hpc/monitoring exists + become: true + file: + path: "/opt/oci-hpc/monitoring" + state: directory + owner: '{{ ansible_user }}' + group: '{{ ansible_user }}' + +- name: Save generated dashboard + copy: + src: "{{ dashboard_build_dir }}/cluster_prometheus_v2.json" + dest: "/opt/oci-hpc/monitoring/cluster_prometheus_v2.json" + remote_src: true + +- name: Import NodeExporter, DCGM, RDMA, NVLink Grafana dashboards + community.grafana.grafana_dashboard: + grafana_url: http://localhost:3000 + state: present + commit_message: Updated by ansible + overwrite: true + path: "{{ dashboard_build_dir }}/cluster_prometheus_v2.json" + diff --git a/playbooks/roles/grafana/tasks/el.yml b/playbooks/roles/grafana/tasks/el.yml old mode 100755 new mode 100644 index ce937288..4e6b4a03 --- a/playbooks/roles/grafana/tasks/el.yml +++ b/playbooks/roles/grafana/tasks/el.yml @@ -43,44 +43,13 @@ no_log: false register: existing_api_keys -# adding ignore errors as yes because of an issue with grafana >=9.1 versions - https://github.com/grafana/grafana/issues/53995 -- name: Create influxdb datasource +- name: Create prometheus datasource grafana_datasource: - name: "InfluxDB" + name: "Prometheus" grafana_url: "{{ grafana_api_url }}" grafana_user: "{{ grafana_security.admin_user }}" grafana_password: "{{ grafana_security.admin_password }}" - org_id: "1" - ds_type: "influxdb" - ds_url: "http://localhost:8086" - database: "telegraf" - time_interval: ">10s" + ds_type: "prometheus" + ds_url: "http://localhost:9090" ignore_errors: yes -- name: Import grafana dashboards through API - uri: - url: "{{ grafana_api_url }}/api/dashboards/db" - user: "{{ grafana_security.admin_user }}" - password: "{{ 
grafana_security.admin_password }}" - force_basic_auth: true - method: POST - body_format: json - body: > - { - "dashboard": {{ lookup("file", item) }}, - "overwrite": true, - "message": "Updated by ansible" - } - no_log: false - with_fileglob: - - files/cluster.json - -#- name: Import Grafana dashboard foo -# community.grafana.grafana_dashboard: -# grafana_url: "{{ grafana_api_url }}" -# grafana_user: "{{ grafana_security.admin_user }}" -# grafana_password: "{{ grafana_security.admin_password }}" -# state: present -# message: Updated by ansible -# overwrite: yes -# path: files/cluster.json diff --git a/playbooks/roles/grafana/tasks/main.yml b/playbooks/roles/grafana/tasks/main.yml old mode 100755 new mode 100644 index 6b947a1b..69819525 --- a/playbooks/roles/grafana/tasks/main.yml +++ b/playbooks/roles/grafana/tasks/main.yml @@ -2,3 +2,5 @@ when: ansible_os_family == 'RedHat' - include_tasks: ubuntu.yml when: ansible_os_family == 'Debian' +- include_tasks: dashboard.yml + when: ansible_os_family == 'RedHat' or ansible_os_family == 'Debian' diff --git a/playbooks/roles/grafana/tasks/ubuntu.yml b/playbooks/roles/grafana/tasks/ubuntu.yml index 45b9ac83..55988fc6 100644 --- a/playbooks/roles/grafana/tasks/ubuntu.yml +++ b/playbooks/roles/grafana/tasks/ubuntu.yml @@ -1,48 +1,43 @@ --- -# - name: add grafana repository -# become: true -# apt_repository: -# name: grafana -# description: grafana -# baseurl: https://packages.grafana.com/oss/deb stable main -# repo_gpgcheck: 1 -# enabled: 1 -# gpgcheck: 1 -# gpgkey: https://packages.grafana.com/gpg.key -# sslverify: 1 -# sslcacert: /etc/pki/tls/certs/ca-bundle.crt - -- name: Add grafana key +- name: Add Grafana GPG key become: true apt_key: - state: present url: https://packages.grafana.com/gpg.key + state: present + keyring: /etc/apt/trusted.gpg.d/grafana-archive-keyring.gpg -- name: Download grafana 8.5.21 package - get_url: - url: https://dl.grafana.com/oss/release/grafana_8.5.21_amd64.deb - dest: 
/tmp/grafana_8.5.21_amd64.deb +- name: Add Grafana repository + become: true + apt_repository: + repo: 'deb https://packages.grafana.com/oss/deb stable main' + state: present + filename: 'grafana' + +- name: Update apt cache + become: true + apt: + update_cache: yes -- name: Install grafana 8.5.21 package +- name: Install Grafana package become: true - ansible.builtin.apt: - deb: /tmp/grafana_8.5.21_amd64.deb + apt: + name: grafana state: present -- name: start grafana +- name: Start and enable Grafana service become: true - service: + service: name: grafana-server - state: restarted - enabled: true + state: started + enabled: yes -- name: Ensure grafana key directory exists +- name: Ensure Grafana key directory exists file: path: "/etc/opt/oci-hpc/passwords/grafana" state: directory delegate_to: localhost -- name: Check api key list +- name: Check Grafana API key list uri: url: "{{ grafana_api_url }}/api/auth/keys" method: GET @@ -56,44 +51,12 @@ retries: 5 delay: 5 -# adding ignore errors as yes because of an issue with grafana >=9.1 versions - https://github.com/grafana/grafana/issues/53995 -- name: Create influxdb datasource +- name: Create Prometheus datasource in Grafana grafana_datasource: - name: "InfluxDB" + name: "Prometheus" grafana_url: "{{ grafana_api_url }}" grafana_user: "{{ grafana_security.admin_user }}" grafana_password: "{{ grafana_security.admin_password }}" - org_id: "1" - ds_type: "influxdb" - ds_url: "http://localhost:8086" - database: "telegraf" - time_interval: ">10s" - ignore_errors: true - -- name: Import grafana dashboards through API - uri: - url: "{{ grafana_api_url }}/api/dashboards/db" - user: "{{ grafana_security.admin_user }}" - password: "{{ grafana_security.admin_password }}" - force_basic_auth: true - method: POST - body_format: json - body: > - { - "dashboard": {{ lookup("file", item) }}, - "overwrite": true, - "message": "Updated by ansible" - } - no_log: false - with_fileglob: - - files/cluster.json - -#- name: Import Grafana 
dashboard foo -# community.grafana.grafana_dashboard: -# grafana_url: "{{ grafana_api_url }}" -# grafana_user: "{{ grafana_security.admin_user }}" -# grafana_password: "{{ grafana_security.admin_password }}" -# state: present -# message: Updated by ansible -# overwrite: yes -# path: files/cluster.json + ds_type: "prometheus" + ds_url: "http://localhost:9090" + ignore_errors: yes diff --git a/playbooks/roles/healthchecks/defaults/main.yml b/playbooks/roles/healthchecks/defaults/main.yml new file mode 100644 index 00000000..d260b8a4 --- /dev/null +++ b/playbooks/roles/healthchecks/defaults/main.yml @@ -0,0 +1,2 @@ +mp_download_link: "https://objectstorage.us-phoenix-1.oraclecloud.com/p/f0EgpgGOU5FlPzDwbSHuxdVQc7A8VgOaThd5KARkDG8Y60QJc53xhg-6m6nlyxRI/n/iding8g8fv8l/b/bm_meshpinger_artifacts/o/meshpinger_bm.tar.gz" +mp_filename: "meshpinger_bm.tar.gz" \ No newline at end of file diff --git a/playbooks/roles/healthchecks/files/check_gpu_setup.py b/playbooks/roles/healthchecks/files/check_gpu_setup.py index ed9132c0..89641103 100644 --- a/playbooks/roles/healthchecks/files/check_gpu_setup.py +++ b/playbooks/roles/healthchecks/files/check_gpu_setup.py @@ -68,7 +68,7 @@ def get_oca_version(): if version < "1.39.0": - logger.error(f"Oracle Cloud Agent: {version} needs to be updated to 1.38.0 or higher") + logger.error(f"Oracle Cloud Agent: {version} needs to be updated to 1.39.0 or higher") else: logger.info(f"Oracle Cloud Agent: {version}") @@ -77,7 +77,14 @@ def get_oca_version(): def check_rttcc_status(): link_status = [] - devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + metadata=get_metadata() + shape=metadata['shape'] + if shape == "BM.GPU.H100.8": + devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + elif shape 
== "BM.GPU.B4.8" or shape == "BM.GPU.A100-v2.8": + devices = ["mlx5_1", "mlx5_2", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + elif shape == "BM.GPU4.8": + devices = ["mlx5_0", "mlx5_1", "mlx5_2", "mlx5_3", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] status = "disabled" status_dict = {"devices": {}} for device in devices: @@ -175,13 +182,13 @@ def check_row_remap_errors(): continue tmp_data = line.split(",") tmp_data = [x.strip() for x in tmp_data] - if tmp_data[0] != "0": + if tmp_data[0] != "0" and tmp_data[0] != "No": logger.debug(f"GPU: {i} - Row Remap Pending: {tmp_data[0]}") remap_issues.append(f"GPU: {i} Row Remap Pending: {tmp_data[0]}") - if tmp_data[1] != "0": + if tmp_data[1] != "0" and tmp_data[0] != "No": logger.debug(f"GPU: {i} - Row Remap Failure: {tmp_data[1]}") #remap_issues.append(f"GPU: {i} Row Remap Failure: {tmp_data[1]}") - if tmp_data[2] != "0": + if tmp_data[2] != "0" and tmp_data[0] != "No": logger.debug(f"GPU: {i} - Row Remap Uncorrectable: {tmp_data[2]}") if int(tmp_data[2]) > 512: remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable >512: {tmp_data[2]}") @@ -246,7 +253,7 @@ def check_rdma_link_status(): logger.debug(f"{device}: {link_state}") link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {link_state}") status = False - if recommendation != "No issue was observed": + if not "No issue was observed" in recommendation: logger.debug(f"{device}: {recommendation}") if "Bad signal integrity" in recommendation and float(physical_BER) < 1e-07: logger.debug(f"Recommandation is {recommendation} but the Physical error are low enough that it can be ignored") diff --git a/playbooks/roles/healthchecks/files/meshpinger_readme.md b/playbooks/roles/healthchecks/files/meshpinger_readme.md new file mode 100644 index 
00000000..b3ff1f06 --- /dev/null +++ b/playbooks/roles/healthchecks/files/meshpinger_readme.md @@ -0,0 +1,183 @@ + +# OCI Meshpinger + +Meshpinger is a tool for validating network layer connectivity between RDMA NICs on a +cluster network in OCI. The tool initiates an ICMP ping from every RDMA NIC +port on the cluster network to every other RDMA NIC port on the same cluster network and +reports back the success/failure status of the pings performed in the form of logs + +Running the tool before starting workload on a cluster network should serve as a good precheck +step to gain confidence on the network reachability between RDMA NICs. Typical causes for +reachability failures that the tool can help pinpoint are, + +1. Host rdma interface down + +2. Host rdma interface missing IP configuration + +3. Host rdma interface missing mac + +4. Host rdma interface enumeration issues + +5. Network connectivity issues between pair of IPs + +# Running Meshpinger + +Meshpinger is installed on the controller host of the hpc cluster. Once user is logged into the controller host, they can trigger meshpinger using the following options, + +- If controller host is supporting only one cluster, run meshpinger on all hosts in that cluster. The cluster is auto-detected in this option. +``` +/opt/oci-hpc/healthchecks/run_meshpinger.sh +``` + +- If controller host is supporting more than 1 cluster, run meshpinger on all hosts in a cluster explicitly specified by its clustername +``` +/opt/oci-hpc/healthchecks/run_meshpinger.sh --hpcclustername +``` + +Run meshpinger on a list of hosts specified in a file. A host can be specified by its IP address or hostname. 
It is expected that the host will be SSH-able from the controller host +``` +/opt/oci-hpc/healthchecks/run_meshpinger.sh --hostlistfile +``` + +# Output + +- All rdma interface configuration issues are reported like the sample below, + +``` +Faulty RDMA interfaces(Link down/misconfigured) + + Hostid/Serial/hostname Interface RDMA_IP PCI MAC Link Status +-- -------------------------- ----------- --------- ------------ ----------------- ------------- + 0 GPU-711/2109XCL016/GPU-711 rdma1 0.0.0.0 0000:98:00.1 b8:ce:f6:00:12:29 DOWN + 1 GPU-278/2110XCL04V/GPU-278 rdma1 0.0.0.0 0000:98:00.1 04:3f:72:e0:6b:0d DOWN +``` + +- If there are ping failures from the run, total number of unique pings that failed per host is printed as a table like the sample below, + +``` +ICMP ping failures per host + + Hostid/Serial/Hostname Total Failures +-- -------------------------- ---------------- + 0 GPU-711/2109XCL016/GPU-711 1 + 1 GPU-278/2110XCL04V/GPU-278 1 +``` +Logfile of the current run that enumerates all combinations that failed ping is printed like, + +``` + interfaces that failed ping are listed at end of the log file meshpinger_log_20241008220615_ocid1.tenancy.oc1..aaaaaaaabddc4obuhgvifcrh6esmw6554ityaqrvxulcksl255gbwehtcq.txt +``` + + +- If there are no ping failures from the run, the following message is printed + +``` +All pings succeeded!! +``` +- Cluster information that includes rdma interface details gathered from the run is stored in a file cluster_info.txt in the current directory, same is printed as below, +``` +clusterinfo file - cluster_info.txt +``` + +# Options +Other options supported are shown in the help text below.
+ +``` +/opt/oci-hpc/healthchecks/run_meshpinger.sh --help + +usage: ./run_meshpinger.sh [-h] + [--hostlistfile HOSTLISTFILE | --hpcclustername HPCCLUSTERNAME] + [--clusterinfo CLUSTERINFO] [--ssh_port SSH_PORT] + [--ping_timeout PING_TIMEOUT] + [--dump_arp_on_failure] [--flush_arp] + [--nic_model NIC_MODEL] + [--objectstoreurl OBJECTSTOREURL] [--enable_inter_rail_ping] + [--threads_per_intf THREADS_PER_INTF] [--verbose] + +optional arguments: + -h, --help show this help message and exit + --hostlistfile HOSTLISTFILE + File listing name/ip of the hosts to include in + meshping + --hpcclustername HPCCLUSTERNAME + OCI HPC stack clustername + --clusterinfo CLUSTERINFO + Use this cluster info file (generated from previous + runs) and skip gathering cluster information in this + run + --ssh_port SSH_PORT ssh port to use, port 22 will be used if not specified + --ping_timeout PING_TIMEOUT + Duration ping waits for reply before timing out, + default is 1sec + --dump_arp_on_failure + Log arp entry for failed pings + --flush_arp Flush arp cache before starting pinger + --nic_model NIC_MODEL + Model of the RDMA NIC eg. MT2910(CX-7) to use if auto + detect fails + --objectstoreurl OBJECTSTOREURL + ObjectStore PAR URL where mesh pinger logs will be + uploaded + --enable_inter_rail_ping + Include this argument to perform pings across the rails. + If so pinger will do a full mesh ping + --threads_per_intf THREADS_PER_INTF + parallel ping threads per local rdma interface, + default is 16 + --verbose Log all debug messages including successful pings. + Default is to log only failed pings +``` + +# Description +Detailed description of each option is below, + +**--hostlistfile** + +Path to file containing the list of hosts to be used for current meshpinger run. A host can be specified by its IP address or its hostname but it should be SSH-able via either of these 2 strings specified. 
String specified here is listed as Hostid on the final report of meshping run + +**--hpcclustername** + +Clustername specified when the cluster was created using OCI HPC stack + + +**--clusterinfo** + +File containing cluster information generated from a previous meshpinger run. When this is specified, current run will skip gathering RDMA interface details from the hosts and move on to doing actual meshping tests saving some runtime. Note that specifying this option forces meshpinger to use RDMA interface details collected previously which could be stale especially for attributes like link state, IP assignment + +**--ssh_port** + +Port to use for ssh to hosts specified in the hostlistfile. By default port 22 will be used if this is not specified + +**--ping_timeout** + +Time in milliseconds that ping waits for a successful reply from remote IP including the time it takes for arp resolution. This timeout is 1 second by default if this option is not specified and overall meshpinger performs 10 retries for each of the remote IPs before marking it as a ping failure + +**--dump_arp_on_failure** + +When this option is specified, for each of the ping failures the corresponding arp table entry(including the status field) for the remote IP on the local host will be dumped in meshpinger logs. By default this is disabled + +**--flush_arp** + +When this option is specified, meshpinger will flush the arp table on each of the hosts before starting the ping validation test + +**--nic_model** + +NIC model to use (e.g MT2910 for CX-7) for filtering out RDMA interfaces from front-end network interfaces while gathering RDMA interface information on each host. By default, meshpinger determines the model based on the model of majority of interfaces on the host given that backend network interface count always exceeds frontend network interface count. + +**--objectstoreurl** + +Pre-Authenticated Request(PAR) url where meshpinger logs will be uploaded. 
This can be used by customers to easily share meshpinger logs with OCI during any incidents. OCI can provide a PAR to an objectstore bucket and share it with the customer to enable sharing of meshpinger logs. + +**--enable_inter_rail_ping** + +This option specifies all rdma interfaces on hosts in the hostlist file are part of a single subnet. In this case meshpinger will do pings to all remote IPs from all local interfaces on a given host. It is to be noted that when this option is chosen net.ipv4.neigh.default.gc_threshX [X=1-3] sysctl setting on every host may need to be bumped up to hold the necessary arp entries per local interface. E.g., for running meshpinger on a 512 host cluster with each host having 16 rdma interfaces, size of the arp table should be at least 130816(511 * 16 * 16). Accordingly it is recommended to set all the 3 sysctl thresholds - net.ipv4.neigh.default.gc_threshX[X=1-3] to 130816. By default, meshpinger only pings along the rails. + +**--threads_per_intf** + +By default meshpinger running on each of the hosts in the hostlist file uses 16 parallel threads per local interface to perform parallel pings. This option overrides that setting with allowed values of 1-32 + +**--verbose** + +By default only ping failures are logged to limit the log file size.
When this option is specified succeeding pings are also logged + + diff --git a/playbooks/roles/healthchecks/files/run_meshpinger.sh b/playbooks/roles/healthchecks/files/run_meshpinger.sh new file mode 100644 index 00000000..79a27dcf --- /dev/null +++ b/playbooks/roles/healthchecks/files/run_meshpinger.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +export WRAPPER_BIN="$0" +export WRAPPER_ENV="OCI_HPC_STACK" +date + +eval "$(ssh-agent -s)" >/dev/null ; ssh-add ~/.ssh/id_rsa >/dev/null + +# Run meshpinger +/opt/oci-hpc/healthchecks/meshpinger_bm/run_meshpinger "$@" \ No newline at end of file diff --git a/playbooks/roles/healthchecks/tasks/main.yml b/playbooks/roles/healthchecks/tasks/main.yml index d42e3997..fc839573 100755 --- a/playbooks/roles/healthchecks/tasks/main.yml +++ b/playbooks/roles/healthchecks/tasks/main.yml @@ -19,4 +19,42 @@ - gpu_bw_test.py - rdma_link_flapping.py - xid_checker.py - - shared_logging.py \ No newline at end of file + - shared_logging.py + +- name: Download oci-mesh-pinger + get_url: + url: "{{mp_download_link}}" + dest: "/tmp/" + when: ('controller' in group_names) + +- name: untar meshpinger + unarchive: + src: "/tmp/{{mp_filename}}" + dest: "/opt/oci-hpc/healthchecks" + when: ('controller' in group_names) + +- name: Copy files + become: true + copy: + src: '{{ item }}' + dest: '/opt/oci-hpc/healthchecks/{{ item }}' + force: no + owner: '{{ ansible_user }}' + group: '{{ ansible_user }}' + mode: 0755 + with_items: + - run_meshpinger.sh + when: ('controller' in group_names) + +- name: Make sure meshpinger dependencies are installed. 
+ vars: + package_name: + - fping + - jq + - lshw + - ethtool + package_state: latest + include_role: + name: safe_yum + ignore_errors: true + \ No newline at end of file diff --git a/playbooks/roles/hostname/tasks/el.yml b/playbooks/roles/hostname/tasks/el.yml index 98966fbe..75011f89 100755 --- a/playbooks/roles/hostname/tasks/el.yml +++ b/playbooks/roles/hostname/tasks/el.yml @@ -2,15 +2,11 @@ - name: update hostname for HPC cluster vars: - index: "{{ hostvars[inventory_hostname]['ansible_default_ipv4']['address'] | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) }}" - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" hostname: - name: "{{queue}}-{{keyword}}-node-{{index}}" + name: "{{hostname_convention}}-{{index}}" when: ('compute' in group_names ) - name: Check Hostname - vars: - - index: "{{ hostvars[inventory_hostname]['ansible_default_ipv4']['address'] | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) }}" - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" shell: cmd: "hostname" register: hostname_output @@ -19,7 +15,6 @@ - name: update hostname for HPC cluster vars: - index: "{{ hostvars[inventory_hostname]['ansible_default_ipv4']['address'] | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) }}" - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" hostname: - name: "{{queue}}-{{keyword}}-node-{{index}}" + name: "{{hostname_convention}}-{{index}}" when: ('compute' in group_names ) and ( hostname_output.stdout != ansible_fqdn.split('.')[0] ) diff --git 
a/playbooks/roles/hostname/tasks/ubuntu.yml b/playbooks/roles/hostname/tasks/ubuntu.yml index 682efa32..f6b13067 100755 --- a/playbooks/roles/hostname/tasks/ubuntu.yml +++ b/playbooks/roles/hostname/tasks/ubuntu.yml @@ -2,7 +2,6 @@ - name: update hostname for HPC cluster vars: - index: "{{ hostvars[inventory_hostname]['ansible_default_ipv4']['address'] | ansible.netcommon.ipsubnet(hostvars[inventory_hostname]['private_subnet']) }}" - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" hostname: - name: "{{queue}}-{{keyword}}-node-{{index}}" + name: "{{hostname_convention}}-{{index}}" when: ('compute' in group_names ) \ No newline at end of file diff --git a/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml b/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml index ef93e456..bb75f359 100644 --- a/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml +++ b/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml @@ -19,10 +19,35 @@ include_role: name: safe_yum +- name: force influxdb gid 997 + become: true + lineinfile: + path: /etc/group + state: present + regexp: '^influxdb:x:(.*)$' + line: 'influxdb:x:997:' + backrefs: yes + +- name: force influxd uid 997 + become: true + lineinfile: + path: /etc/passwd + state: present + regexp: '^influxdb:x:(.*)$' + line: 'influxdb:x:997:997::/var/lib/influxdb:/bin/false' + backrefs: yes + +- name: restart influxdb + become: true + service: + name: influxdb + state: restarted + enabled: yes + - name: install influx pip become: true vars: ansible_python_interpreter: /usr/bin/python3 pip: name: influxdb - executable: pip3 \ No newline at end of file + executable: pip3 diff --git a/playbooks/roles/limits/templates/limits.j2 b/playbooks/roles/limits/templates/limits.j2 index 1a5b5947..efa2874a 100755 --- a/playbooks/roles/limits/templates/limits.j2 +++ 
b/playbooks/roles/limits/templates/limits.j2 @@ -17,7 +17,7 @@ ####* soft stack 1048576 #### -{% if shape == "BM.GPU.B4.8" or shape == "BM.GPU4.8" or shape == "BM.GPU.A100-v2.8" or shape == "BM.GPU.H100.8" %} +{% if shape == "BM.GPU.B4.8" or shape == "BM.GPU4.8" or shape == "BM.GPU.A100-v2.8" or shape == "BM.GPU.H100.8" or shape == "BM.GPU.MI300X.8" or shape == "BM.GPU.L40S.4"%} * soft nproc 40960 * hard nproc 40960 * soft nofile 20480 diff --git a/playbooks/roles/localdisk/tasks/common.yml b/playbooks/roles/localdisk/tasks/common.yml index 35a5efb4..e1b763cd 100755 --- a/playbooks/roles/localdisk/tasks/common.yml +++ b/playbooks/roles/localdisk/tasks/common.yml @@ -1,89 +1,101 @@ --- -- name: check path - set_fact: +- name: Check NVMe path and names + set_fact: nvme_path_edited: "{% if nvme_path[-1] == '/' %}{{nvme_path[:-1]}}{% else%}{{nvme_path}}{% endif %}" + nvmes: "{{ (hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[1-9][0-9]n1') | list ) + (hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list) }}" + dev_prefix: "/dev/" -- name: Get the number of NVMe's +- name: Prefix Devices set_fact: - nvme_count: "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]|[1-9][0-9]n1') | list | length}}" + nvme_devices: "{{ [dev_prefix] | product(nvmes) | map('join') | list }}" -- name: Create a LVM? +- name: Create Mdadm Raid? 
set_fact: - one_lv: "{{( log_vol | bool ) and ( ( nvme_count | int ) > 1 )}}" - -- name: Create a new primary partition - parted: - device: "/dev/{{item}}" - number: 1 - state: present - label: gpt - with_items: - - "{{ (hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[1-9][0-9]n1') | list ) + (hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list) }}" - -- name: create a filesystem - filesystem: - dev: "/dev/{{item}}p1" - fstype: xfs - opts: "-L locscratch{{item | replace('nvme','') | replace('n1','')}}" - with_items: - - "{{ (hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[1-9][0-9]n1') | list ) + (hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list) }}" - when: not ( one_lv | bool ) + mdadm_vol: "{{( log_vol | bool ) and ( ( nvmes | count ) > 1 )}}" -- name: Mount local volume - mount: - path: "{% if item | replace('nvme','') | replace('n1','') == '0' %}{{ nvme_path_edited}}{% else%}{{ nvme_path_edited}}{{item | replace('nvme','') | replace('n1','')}}{% endif %}" - src: "LABEL=locscratch{{item | replace('nvme','') | replace('n1','')}}" - fstype: xfs - opts: defaults,noatime - state: mounted - with_items: - - "{{ (hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[1-9][0-9]n1') | list ) + (hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list) }}" - when: not ( one_lv | bool ) -- name: "set permissions on {{ nvme_path_edited }}" - become: true - file: - path: "{% if item | replace('nvme','') | replace('n1','') == '0' %}{{ nvme_path_edited}}{% else%}{{ nvme_path_edited}}{{item | replace('nvme','') | replace('n1','')}}{% endif %}" - state: directory - owner: "{{ ansible_user }}" - mode: 0775 - group: "{{privilege_group_name}}" - recurse: no - with_items: - - "{{ (hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[1-9][0-9]n1') | list ) + 
(hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list) }}" - when: not ( one_lv | bool ) - -- name: Check for lvm devices - shell: fdisk -l|grep vg_nvmes|wc -l - register: lv_count - -- block: - - name: Create volume group - lvg: - vg: "vg_nvmes" - pvs: "{{['/dev/']|product((hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[1-9][0-9]n1') | list ) + (hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list) )|map('join', '') | join(',')}}" +- name: Partition Per Disk + block: + - name: Create A New Primary Partition + parted: + device: "/dev/{{item}}" + number: 1 + state: present + label: gpt + with_items: + - "{{ nvmes }}" + + - name: Create A Filesystem + filesystem: + dev: "/dev/{{item}}p1" + fstype: xfs + opts: "-L locscratch{{item | replace('nvme','') | replace('n1','')}}" + with_items: + - "{{ nvmes }}" + + - name: Mount Local Volume + mount: + path: "{% if item | replace('nvme','') | replace('n1','') == '0' %}{{ nvme_path_edited}}{% else%}{{ nvme_path_edited}}{{item | replace('nvme','') | replace('n1','')}}{% endif %}" + src: "LABEL=locscratch{{item | replace('nvme','') | replace('n1','')}}" + fstype: xfs + opts: defaults,noatime + state: mounted + with_items: + - "{{ nvmes }}" + + - name: "set permissions on {{ nvme_path_edited }}" + become: true + file: + path: "{% if item | replace('nvme','') | replace('n1','') == '0' %}{{ nvme_path_edited}}{% else%}{{ nvme_path_edited}}{{item | replace('nvme','') | replace('n1','')}}{% endif %}" + state: directory + owner: "{{ ansible_user }}" + mode: 0775 + group: "{{privilege_group_name}}" + recurse: no + with_items: + - "{{ nvmes }}" + + when: not ( mdadm_vol | bool ) - - name: Create Logical volume - lvol: - vg: "vg_nvmes" - lv: "lv_nvmes" - size: 100%FREE - opts: "{% if redundancy | bool %}--type raid10{% else%}{% if ( nvme_count | int ) > 3 %}-i4{% else%}-i2{% endif %}{% endif %}" +- name: Arrays + block: + - name: Checking Status 
Of Array + shell: "cat /proc/mdstat | grep md0" + register: "array_check" + changed_when: false + failed_when: false + check_mode: no + + - debug: + msg: "{{ array_check.rc }}" - - name: Create file system + # Creating raid arrays + - name: Creating Array + shell: "yes | mdadm --create /dev/md0 {% if redundancy | bool %}--level=10{% else%}--level=0{% endif%} --raid-devices={{ nvmes | count }} {{ nvme_devices | join(' ') }}" + register: "array_created" + when: array_check.rc != 0 + + # Capture the raid array details to append to mdadm.conf + - name: Capturing Array Details + command: "mdadm --detail --scan --verbose" + register: "array_details" + changed_when: false + + - name: Create Filesystem filesystem: fstype: xfs - dev: "/dev/vg_nvmes/lv_nvmes" + dev: "/dev/md0" + opts: -f - - name: Mount local volume + - name: Mount Array mount: path: "{{ nvme_path_edited}}" - src: "/dev/vg_nvmes/lv_nvmes" + src: "/dev/md0" fstype: xfs - opts: defaults,noatime + opts: defaults,nofail,noatime,discard state: mounted - - name: "set permissions on {{ nvme_path_edited }}" + - name: "Set Permissions On {{ nvme_path_edited }}" become: true file: path: "{{ nvme_path_edited}}" @@ -92,10 +104,33 @@ mode: 0775 group: "{{privilege_group_name}}" recurse: no + + - name: Ensure mdadm_conf's directory exists + file: + path: /etc/mdadm + state: directory + + - name: Ensure mdadm_conf file exists + copy: + content: "" + dest: /etc/mdadm/mdadm.conf + force: no + + # Updating mdadm.conf in order to persist between reboots + - name: Updating mdadm_conf + lineinfile: + dest: /etc/mdadm/mdadm.conf + regexp: "^{{ item }}" + line: "{{ item }}" + state: "present" + with_items: '{{ array_details.stdout_lines }}' + when: array_created.changed + + - name: rebuild initramfs if ubuntu + shell: update-initramfs -k all -u + when: ansible_facts['distribution'] == "Ubuntu" and array_created.changed + when: - - one_lv | bool - - lv_count.stdout == '0' + - mdadm_vol | bool + -- name: rebuild initramfs if ubuntu 
# Seconds to wait for the instance metadata endpoint before giving up.
_METADATA_TIMEOUT_SECONDS = 10

# RDMA-capable mlx5 devices, keyed by OCI shape name.
_RDMA_DEVICES_BY_SHAPE = {
    "BM.GPU.H100.8": (
        "mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6",
        "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13",
        "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17",
    ),
    "BM.GPU.B4.8": (
        "mlx5_1", "mlx5_2", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6",
        "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12",
        "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17",
    ),
    "BM.GPU4.8": (
        "mlx5_0", "mlx5_1", "mlx5_2", "mlx5_3", "mlx5_6", "mlx5_7",
        "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12", "mlx5_13",
        "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17",
    ),
    "BM.Optimized3.36": ("mlx5_2", "mlx5_3"),
}
# BM.GPU.A100-v2.8 uses the same device list as BM.GPU.B4.8.
_RDMA_DEVICES_BY_SHAPE["BM.GPU.A100-v2.8"] = _RDMA_DEVICES_BY_SHAPE["BM.GPU.B4.8"]


def get_metadata():
    """Fetch the instance metadata document from the metadata endpoint.

    Returns:
        The decoded JSON body of ``http://169.254.169.254/opc/v2/instance/``.

    Raises:
        requests.RequestException: if the request fails or does not complete
            within ``_METADATA_TIMEOUT_SECONDS``.
    """
    headers = {'Authorization': 'Bearer Oracle'}
    metadata_url = "http://169.254.169.254/opc/"
    metadata_ver = "2"
    request_url = metadata_url + "v" + metadata_ver + "/instance/"

    # Without a timeout a stalled endpoint would block the caller forever.
    response = requests.get(
        request_url, headers=headers, timeout=_METADATA_TIMEOUT_SECONDS
    )
    return response.json()


def is_user_root():
    """Return True when the effective uid of this process is 0, else False."""
    if os.geteuid() != 0:
        logger.debug("User is not root")
        return False
    return True


def get_host_serial():
    """Return the system serial number printed by ``dmidecode``.

    ``dmidecode`` is run through ``sudo`` when the process is not root.

    Returns:
        The serial number without surrounding whitespace, or the string
        ``'None'`` when dmidecode prints nothing or "Not Specified".
    """
    cmd = ['dmidecode', '-s', 'system-serial-number']
    if not is_user_root():
        cmd = ['sudo'] + cmd

    result = subprocess.run(cmd, stdout=subprocess.PIPE)

    # Strip before validating so whitespace-only output is treated as empty.
    serial = result.stdout.decode('utf-8').strip()
    if not serial or 'Not Specified' in serial:
        return 'None'
    return serial


def get_slurm_instance_data():
    """Read this node's shape and cluster name from ``scontrol show node``.

    The node name is taken from the ``HOSTNAME`` environment variable, falling
    back to ``os.uname().nodename`` when the variable is unset or empty.

    Returns:
        ``{'instance_shape': ..., 'cluster_name': ...}`` taken from the two
        groups of the ``ActiveFeatures=<a>,<b>`` entry, or an empty dict when
        ``scontrol`` is not installed or the entry is absent from its output.
    """
    config = {}

    if shutil.which('scontrol') is None:
        logger.debug('Command "scontrol" does not exist')
        return config

    hostname = os.environ.get("HOSTNAME") or os.uname().nodename
    cmd = ['scontrol', 'show', 'node', hostname]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    decoded_output = result.stdout.decode('utf-8')

    # Search for pattern and get desired values
    pattern = r'ActiveFeatures=(\w+.*),(.*)'
    match = re.search(pattern, decoded_output)
    if match is None:
        # Unknown node or unexpected output: report "no data" instead of
        # failing with AttributeError on match.group().
        logger.debug(f"No ActiveFeatures entry found for node {hostname}")
        return config

    config['instance_shape'] = match.group(1)
    config['cluster_name'] = match.group(2)
    return config


def get_net_devices():
    """Map the first field of each ``ibdev2netdev`` line to its fifth field.

    Returns:
        A dict built from the command output; empty when ``ibdev2netdev`` is
        not installed or prints nothing usable.
    """
    cmd = ['ibdev2netdev']
    net_devices = {}

    if shutil.which(cmd[0]) is None:
        logger.debug(f'Command "{cmd[0]}" does not exist')
        return net_devices

    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output = result.stdout.decode('utf-8')

    for line in output.split('\n'):
        fields = line.split()
        # Lines with fewer than five fields cannot provide fields[4]; skip
        # them rather than raising IndexError.
        if len(fields) > 4:
            net_devices[fields[0]] = fields[4]

    return net_devices


def get_rdma_devices(oci_shape):
    """Return the list of RDMA device names for ``oci_shape``.

    Args:
        oci_shape: shape name such as ``"BM.GPU.H100.8"``.

    Returns:
        A new list of ``mlx5_*`` device names, or an empty list for a shape
        that is not in the table.
    """
    return list(_RDMA_DEVICES_BY_SHAPE.get(oci_shape, ()))


def create_textfile_dir(ne_dir, tf_dir_name, ne_user, ne_group):
    """Ensure ``<ne_dir>/<tf_dir_name>`` exists and is owned by user/group.

    The directory is created (and its ownership set) only when it does not
    exist yet; commands go through ``sudo -S`` when the process is not root.

    Args:
        ne_dir: parent directory.
        tf_dir_name: name of the directory to create inside ``ne_dir``.
        ne_user: owner to set; must be a known user.
        ne_group: group to set; must be a known group.

    Returns:
        The directory path on success; ``False`` when the user or group does
        not exist or the directory cannot be created.

    Raises:
        subprocess.CalledProcessError: if ``chown``/``chgrp`` exits non-zero.
    """
    textfile_dir = os.path.join(ne_dir, tf_dir_name)

    # Check if user exists
    try:
        pwd.getpwnam(ne_user)
    except KeyError:
        logger.debug(f"User {ne_user} does not exist")
        return False

    # Check if group exists
    try:
        grp.getgrnam(ne_group)
    except KeyError:
        logger.debug(f"Group {ne_group} does not exist")
        return False

    if not os.path.exists(textfile_dir):
        sudo_prefix = [] if is_user_root() else ['sudo', '-S']

        try:
            subprocess.run(sudo_prefix + ['mkdir', '-p', textfile_dir], check=True)
        except (subprocess.CalledProcessError, OSError):
            logger.debug(f"Unable to create {textfile_dir} directory")
            return False

        # Update user and group ownership of directory
        ownership_commands = (
            ['chown', '-R', ne_user, textfile_dir],
            ['chgrp', '-R', ne_group, textfile_dir],
        )
        for command in ownership_commands:
            subprocess.run(sudo_prefix + command, check=True)

    return textfile_dir


def copy_metric_file(src_tf_path, dest_tf_path, mfile_owner, mfile_group):
    """Set ownership on ``src_tf_path`` and move it over ``dest_tf_path``.

    Args:
        src_tf_path: temporary metrics file to publish.
        dest_tf_path: final location; its parent directory must exist.
        mfile_owner: owner to set on the file; must be a known user.
        mfile_group: group to set on the file; must be a known group.

    Returns:
        ``True`` when the file was moved; ``False`` when the user or group is
        unknown, the destination directory is missing, or ``mv`` fails.

    Raises:
        subprocess.CalledProcessError: if ``chown``/``chgrp`` exits non-zero.
    """
    # Check if user exists
    try:
        pwd.getpwnam(mfile_owner)
    except KeyError:
        logger.debug(f"User {mfile_owner} does not exist")
        return False

    # Check if group exists
    try:
        grp.getgrnam(mfile_group)
    except KeyError:
        logger.debug(f"Group {mfile_group} does not exist")
        return False

    sudo_prefix = [] if is_user_root() else ['sudo', '-S']

    # Update user and group ownership of the text file
    ownership_commands = (
        ['chown', '-R', mfile_owner, src_tf_path],
        ['chgrp', '-R', mfile_group, src_tf_path],
    )
    for command in ownership_commands:
        subprocess.run(sudo_prefix + command, check=True)

    # Move the temporary metrics file to its destination, overwriting any
    # existing file.
    tf_dir_path = os.path.dirname(dest_tf_path)
    if not os.path.exists(tf_dir_path):
        logger.debug(f'Unable to move {src_tf_path} to {dest_tf_path}')
        return False

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed to mv verbatim.
    moved = subprocess.run(sudo_prefix + ['mv', src_tf_path, dest_tf_path])
    if moved.returncode != 0:
        logger.debug(f'Unable to move {src_tf_path} to {dest_tf_path}')
        return False

    return True
+DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). +DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). +DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). +DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). +DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). +DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param). +# Memory usage +DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). +DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). + +# ECC +DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. +DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. +DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. +DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + +# Retired pages +# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. +# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. +# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + +# NVLink +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. 
+DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes + +# VGPU License status +DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + +# Remapped rows +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors +DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + +# Static configuration information. These appear as labels on the other metrics +DCGM_FI_DRIVER_VERSION, label, Driver Version +DCGM_FI_NVML_VERSION, label, NVML Version +DCGM_FI_DEV_BRAND, label, Device Brand +DCGM_FI_DEV_SERIAL, label, Device Serial Number +# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version +# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version +# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version +# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version +# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device + diff --git a/playbooks/roles/metrics-exporter/files/node_exporter.service b/playbooks/roles/metrics-exporter/files/node_exporter.service new file mode 100644 index 00000000..7376259c --- /dev/null +++ b/playbooks/roles/metrics-exporter/files/node_exporter.service @@ -0,0 +1,11 @@ +[Unit] +Description=Node Exporter +Wants=network-online.target +After=network-online.target + +[Service] +User=prometheus +ExecStart=/usr/bin/node_exporter --collector.nfs --collector.mountstats --collector.systemd --collector.processes --collector.textfile.directory=/var/lib/node_exporter/textfile_collector + +[Install] +WantedBy=default.target diff --git a/playbooks/roles/metrics-exporter/files/nvlink_counters_exporter.py b/playbooks/roles/metrics-exporter/files/nvlink_counters_exporter.py new file mode 100644 index 00000000..22843b24 --- /dev/null +++ b/playbooks/roles/metrics-exporter/files/nvlink_counters_exporter.py @@ -0,0 +1,83 @@ 
def get_nvlink_metrics():
    """Parse ``nvidia-smi nvlink -gt rd`` output and publish per-link gauges.

    For every ``GPU <n>: <model> (UUID: <uuid>)`` header the model and UUID are
    stored in ``gpu_data["GPU_<n>"]`` and ``link_data["GPU_<n>"]`` is reset.
    For every following ``Link <k>: Data|Raw Tx|Rx: <v> KiB`` line the value is
    set on the matching gauge (labels: hostname, gpu=<n>, link=<k>) and stored
    in ``link_data["GPU_<n>"][<k>]``.
    """
    # (payload kind, direction) -> (gauge to update, key used in link_data)
    counters = {
        ("Data", "Tx"): (data_tx_kib, "data_tx_kib"),
        ("Data", "Rx"): (data_rx_kib, "data_rx_kib"),
        ("Raw", "Tx"): (raw_tx_kib, "raw_tx_kib"),
        ("Raw", "Rx"): (raw_rx_kib, "raw_rx_kib"),
    }
    gpu_pattern = re.compile(r"GPU (\d+): (.+?) \(UUID: (.+?)\)")
    link_pattern = re.compile(r"Link (\d+): (Data|Raw) (Tx|Rx): (\d+) KiB")

    # These are assigned below, so they are locals of this function; they must
    # be initialised here or a "Link" line seen before any GPU header raises
    # UnboundLocalError.
    gpu = None
    gpu_id = None

    hostname = subprocess.getoutput("hostname")
    report = subprocess.getoutput("/usr/bin/nvidia-smi nvlink -gt rd")

    for raw_line in report.split("\n"):
        line = raw_line.strip()

        if line.startswith("GPU"):
            gpu_match = gpu_pattern.match(line)
            if gpu_match:
                gpu_id = gpu_match.group(1)
                gpu = f"GPU_{gpu_id}"
                gpu_data[gpu] = {
                    "model": gpu_match.group(2),
                    "uuid": gpu_match.group(3),
                }
                link_data[gpu] = {}
            continue

        if not line.startswith("Link") or gpu is None:
            continue

        link_match = link_pattern.match(line)
        if link_match is None:
            continue

        link_id, payload, direction, value = link_match.groups()
        gauge, key = counters[(payload, direction)]
        kib = int(value)
        gauge.labels(hostname=hostname, gpu=gpu_id, link=link_id).set(kib)
        link_data[gpu].setdefault(link_id, {})[key] = kib
['hostname', 'interface']) +roce_adp_retrans = Gauge('rdma_roce_adp_retrans', 'Number of adaptive retransmissions for RoCE traffic', ['hostname', 'interface']) +np_cnp_sent = Gauge('rdma_np_cnp_sent', 'Number of CNP packets sent', ['hostname', 'interface']) +rp_cnp_handled = Gauge('rdma_rp_cnp_handled', 'Number of CNP packets handled to throttle', ['hostname', 'interface']) +rp_cnp_ignored = Gauge('rdma_rp_cnp_ignored', 'Number of CNP packets received and ignored', ['hostname', 'interface']) +rx_icrc_encapsulated = Gauge('rdma_rx_icrc_encapsulated', 'Number of RoCE packets with ICRC (Invertible Cyclic Redundancy Check) errors', ['hostname', 'interface']) +roce_slow_restart = Gauge('rdma_roce_slow_restart', 'Number of times RoCE slow restart was used', ['hostname', 'interface']) +# Port Counters for Infiniband +ib_link_state = Gauge('ib_link_state', 'Port State', ['hostname', 'interface']) +ib_link_phys_state = Gauge('ib_link_phys_state', 'Port Physical State', ['hostname', 'interface']) +ib_symbol_error = Gauge('ib_symbol_error', 'Total number of minor link errors detected on one or more physical lanes', ['hostname', 'interface']) +ib_port_rcv_errors = Gauge('ib_port_rcv_errors', 'Total number of packets containing an error that were received on the port', ['hostname', 'interface']) +ib_port_rcv_remote_phsyical_errors = Gauge('ib_port_rcv_remote_phsyical_errors', 'Total number of packets marked with the EBP delimiter received on the port', ['hostname', 'interface']) +ib_port_rcv_switch_relay_errors = Gauge('ib_port_rcv_switch_relay_errors', 'Total number of packets received on the port that were discarded because they could not be forwarded by the switch relay', ['hostname', 'interface']) +ib_link_error_recovery = Gauge('ib_link_error_recovery', 'Total number of times the Port Training state machine has successfully completed the link error recovery process', ['hostname', 'interface']) +ib_port_xmit_constraint_errors = Gauge('ib_port_xmit_constraint_errors', 'Total 
number of packets not transmitted from the switch physical port due to outbound raw filtering or failing outbound partition or IP version check', ['hostname', 'interface']) +ib_port_rcv_contraint_errors = Gauge('ib_port_rcv_contraint_errors', 'Total number of packets received on the switch physical port that are discarded due to inbound raw filtering or failing inbound partition or IP version check.', ['hostname', 'interface']) +ib_local_link_integrity_errors = Gauge('ib_local_link_integrity_errors', 'The number of times that the count of local physical errors exceeded the threshold specified by LocalPhyErrors', ['hostname', 'interface']) +ib_excessive_buffer_overrun_errors = Gauge('ib_excessive_buffer_overrun_errors', 'This counter, indicates an input buffer overrun. It indicates possible misconfiguration of a port, either by the Subnet Manager (SM) or by user intervention. It can also indicate hardware issues or extremely poor link signal integrity', ['hostname', 'interface']) +ib_port_xmit_data = Gauge('ib_port_xmit_data', 'Total number of data octets, divided by 4 (lanes), transmitted on all VLs', ['hostname', 'interface']) +ib_port_rcv_data = Gauge('ib_port_rcv_data', 'Total number of data octets, divided by 4 (lanes), received on all VLs', ['hostname', 'interface']) +ib_port_xmit_packets = Gauge('ib_port_xmit_packets', 'Total number of packets transmitted on all VLs from this port. This may include packets with errors', ['hostname', 'interface']) +ib_port_rcv_packets = Gauge('ib_port_rcv_packets', 'Total number of packets received on all VLs from this port. This may include packets with errors', ['hostname', 'interface']) +ib_unicast_rcv_packets = Gauge('ib_unicast_rcv_packets', 'Total number of unicast packets, including unicast packets containing errors', ['hostname', 'interface']) +ib_unicast_xmit_packets = Gauge('ib_unicast_xmit_packets', 'Total number of unicast packets transmitted on all VLs from the port. 
def _list_rdma_nics():
    """Return device names from ``rdma link show`` lines that contain "rdma".

    The name is the second space-separated field with every "/1" removed.
    Returns an empty list when the ``rdma`` command cannot be executed.
    """
    try:
        completed = subprocess.run(
            ["rdma", "link", "show"],
            stdout=subprocess.PIPE,
            # Keep error text out of the parsed output.
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        return []

    nics = []
    for line in completed.stdout.decode("utf-8", "replace").split("\n"):
        if "rdma" not in line:
            continue
        fields = line.split(" ")
        if len(fields) > 1 and fields[1]:
            nics.append(fields[1].replace("/1", ""))
    return nics


def _read_port_value(nic, relative_path):
    """Read an integer from ``/sys/class/infiniband/<nic>/ports/1/<relative_path>``.

    Only the text before the first ':' is converted, so both plain counters
    and the ``state`` / ``phys_state`` files are handled. Returns None when
    the file cannot be read or does not start with an integer.
    """
    path = "/sys/class/infiniband/{nic}/ports/1/{rel}".format(
        nic=nic, rel=relative_path
    )
    try:
        with open(path) as handle:
            text = handle.read()
        return int(text.split(':')[0])
    except (OSError, ValueError):
        return None


def get_rdma_metrics():
    """Read the port-1 counters of every RDMA NIC and publish them as gauges.

    Each value is read directly from sysfs and set on its gauge with the
    labels ``hostname`` and ``interface``. A counter that is missing or
    unreadable on a NIC is skipped, so one absent file no longer aborts the
    whole collection pass.
    """
    # (gauge, path relative to /sys/class/infiniband/<nic>/ports/1/)
    sources = (
        # Hardware counters for ROCEv2 Diagnostics
        (np_ecn_marked_roce_packets, "hw_counters/np_ecn_marked_roce_packets"),
        (out_of_sequence, "hw_counters/out_of_sequence"),
        (packet_seq_err, "hw_counters/packet_seq_err"),
        (local_ack_timeout_err, "hw_counters/local_ack_timeout_err"),
        (roce_adp_retrans, "hw_counters/roce_adp_retrans"),
        (np_cnp_sent, "hw_counters/np_cnp_sent"),
        (rp_cnp_handled, "hw_counters/rp_cnp_handled"),
        (rp_cnp_ignored, "hw_counters/rp_cnp_ignored"),
        (rx_icrc_encapsulated, "hw_counters/rx_icrc_encapsulated"),
        (roce_slow_restart, "hw_counters/roce_slow_restart"),
        # Port counters for Mellanox Port Diagnostics
        (ib_link_state, "state"),
        (ib_link_phys_state, "phys_state"),
        (ib_symbol_error, "counters/symbol_error"),
        (ib_port_rcv_errors, "counters/port_rcv_errors"),
        (ib_port_rcv_remote_phsyical_errors, "counters/port_rcv_remote_physical_errors"),
        (ib_port_rcv_switch_relay_errors, "counters/port_rcv_switch_relay_errors"),
        (ib_link_error_recovery, "counters/link_error_recovery"),
        (ib_port_xmit_constraint_errors, "counters/port_xmit_constraint_errors"),
        (ib_port_rcv_contraint_errors, "counters/port_rcv_constraint_errors"),
        (ib_local_link_integrity_errors, "counters/local_link_integrity_errors"),
        (ib_excessive_buffer_overrun_errors, "counters/excessive_buffer_overrun_errors"),
        (ib_port_xmit_data, "counters/port_xmit_data"),
        (ib_port_rcv_data, "counters/port_rcv_data"),
        (ib_port_xmit_packets, "counters/port_xmit_packets"),
        (ib_port_rcv_packets, "counters/port_rcv_packets"),
        (ib_unicast_rcv_packets, "counters/unicast_rcv_packets"),
        (ib_unicast_xmit_packets, "counters/unicast_xmit_packets"),
        (ib_multicast_rcv_packets, "counters/multicast_rcv_packets"),
        (ib_multicast_xmit_packets, "counters/multicast_xmit_packets"),
        (ib_link_downed, "counters/link_downed"),
        (ib_port_xmit_discards, "counters/port_xmit_discards"),
        (ib_VL15_dropped, "counters/VL15_dropped"),
        (ib_port_xmit_wait, "counters/port_xmit_wait"),
    )

    hostname = subprocess.getoutput("hostname")
    for nic in _list_rdma_nics():
        for gauge, relative_path in sources:
            value = _read_port_value(nic, relative_path)
            if value is None:
                continue
            gauge.labels(hostname=hostname, interface=nic).set(value)
+ start_http_server(9500) + # Generate RDMA metrics every 10 seconds + while True: + get_rdma_metrics() + time.sleep(10) diff --git a/playbooks/roles/metrics-exporter/files/rdma_link_flapping.py b/playbooks/roles/metrics-exporter/files/rdma_link_flapping.py new file mode 100644 index 00000000..2b4b8e8a --- /dev/null +++ b/playbooks/roles/metrics-exporter/files/rdma_link_flapping.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +import os +import sys +import time +import datetime +import re +import argparse +import socket +import subprocess +from shared_logging import logger + + +class LinkFlappingTest: + def __init__(self, time_interval=6): + self.results = None + self.time_interval = int(time_interval) + self.link_data = None + + + # Check if the log file exists + msg_file = "/var/log/messages" + if not os.path.exists(msg_file): + msg_file = "/var/log/syslog" + self.log_file = msg_file + + def get_rdma_link_failures(self): + + pattern = r"(\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})\s+\S+\s+wpa_supplicant(?:\[\d+\])?: (\w+): CTRL-EVENT-EAP-FAILURE EAP authentication failed" + pattern2 = r"(\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})\s+\S+\s+kernel: (?:\[\d+\.\d+\]\s)?mlx5_core \S+ (\w+): Link down" + + self.link_data = {} + with open(self.log_file, "r") as f: + for line in f: + match = re.search(pattern, line) + if match: + time_str = match.group(1) + interface = match.group(2) + logger.debug(f"time: {time_str}, interface: {interface}") + if interface not in self.link_data: + self.link_data[interface] = {"failures": [time_str], "link_down": []} + else: + self.link_data[interface]["failures"].append(time_str) + + + match = re.search(pattern2, line) + if match: + time_str = match.group(1) + interface = match.group(2) + logger.debug(f"time: {time_str}, interface: {interface}") + if interface not in self.link_data: + self.link_data[interface] = {"failures": [], "link_down": [time_str]} + else: + self.link_data[interface]["link_down"].append(time_str) + + logger.debug("Link Data: 
{}".format(self.link_data)) + return self.link_data + + def process_rdma_link_flapping(self): + + link_issues = {"failures": [], "link_down": []} + + # Get the time stamp when the host came up + bootup_time = subprocess.run(['uptime', '-s'], stdout=subprocess.PIPE) + bootup_time = bootup_time.stdout.decode('utf-8').strip() + bootup_time_str = datetime.datetime.strptime(bootup_time, "%Y-%m-%d %H:%M:%S") + bootup_time_sec = int(time.mktime(bootup_time_str.timetuple())) + bootup_time_grace_period = bootup_time_sec + 1800 + + status = 0 + if len(self.link_data) >= 0: + current_date = datetime.datetime.now() + current_date_str = current_date.strftime("%Y-%b-%d %H:%M:%S") + current_date_sec = int(time.mktime(datetime.datetime.strptime(current_date_str, "%Y-%b-%d %H:%M:%S").timetuple())) + + link_failures = False + for interface in self.link_data: + if len(self.link_data[interface]["failures"]) > 0: + link_failures = True + logger.debug(f"{interface}: {len(self.link_data[interface]['failures'])} RDMA link failure entries in {self.log_file}") + logger.debug(f"{interface}: {self.link_data[interface]['failures']}") + last_date_failure_str = None + + if len(self.link_data[interface]["failures"]) > 0: + last_date_failure_str = self.link_data[interface]["failures"][-1] + last_date_failure = datetime.datetime.strptime(last_date_failure_str, "%b %d %H:%M:%S") + + # Compare the month of the last failure date with the current month + if last_date_failure.month > current_date.month: + # If the last failure month is greater than the current month, subtract one from the current year + last_date_failure = last_date_failure.replace(year=current_date.year - 1) + else: + # Otherwise, set the year of the last failure date to the current year + last_date_failure = last_date_failure.replace(year=current_date.year) + + # Convert the last failure date to seconds since the epoch + last_date_failure_sec = int(time.mktime(last_date_failure.timetuple())) + + if last_date_failure_str != None and 
last_date_failure_str != current_date_str: + diff_secs = current_date_sec - last_date_failure_sec + diff_hours = diff_secs // (60 * 60) + logger.debug(f"RDMA link ({interface}) failed {diff_hours} hours ago") + + logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") + if diff_hours < self.time_interval and last_date_failure_sec > bootup_time_grace_period: + logger.debug(f"{interface}: one or more RDMA link flapping events within {self.time_interval} hours. Last flapping event: {last_date_failure_str})") + link_issues["failures"].append(f"{interface}: {len(self.link_data[interface]['failures'])}") + status = -1 + + for interface in self.link_data: + if len(self.link_data[interface]["link_down"]) > 0: + logger.debug(f"{interface}: {len(self.link_data[interface]['link_down'])} RDMA link down entries in {self.log_file}") + logger.debug(f"{interface}: {self.link_data[interface]['link_down']}") + last_date_down_str = None + + if len(self.link_data[interface]["link_down"]) > 0: + last_date_down_str = self.link_data[interface]["link_down"][-1] + last_date_down = datetime.datetime.strptime(last_date_down_str, "%b %d %H:%M:%S") + + # Compare the month of the last failure date with the current month + if last_date_down.month > current_date.month: + # If the last failure month is greater than the current month, subtract one from the current year + last_date_down = last_date_down.replace(year=current_date.year - 1) + else: + # Otherwise, set the year of the last failure date to the current year + last_date_down = last_date_down.replace(year=current_date.year) + + # Convert the last failure date to seconds since the epoch + last_date_down_sec = int(time.mktime(last_date_down.timetuple())) + + + if last_date_down_str != None and last_date_down_str != current_date_str: + diff_secs = current_date_sec - last_date_down_sec + diff_hours = diff_secs 
// (60 * 60) + logger.debug(f"RDMA link ({interface}) down {diff_hours} hours ago") + + logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") + if diff_hours < self.time_interval and last_date_down_sec > bootup_time_grace_period: + logger.debug(f"{interface}, one or more RDMA link down events within {self.time_interval} hours. Last link down event: {last_date_down_str}") + link_issues["link_down"].append(f"{interface}: {len(self.link_data[interface]['link_down'])}") + status = -2 + if status == -1: + logger.debug(f"One or more RDMA link flapping events within the past {self.time_interval} hours") + if status == -2: + logger.debug(f"One or more RDMA link down events within the past {self.time_interval} hours") + + else: + logger.info("No RDMA link failures entry in /var/log/messages") + if status == 0: + logger.info("RDMA link flapping/down test: Passed") + else: + logger.warning("RDMA link flapping/down test: Failed") + return link_issues + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Process RDMA link flapping data") + parser.add_argument("-l", "--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Set the logging level") + args = parser.parse_args() + + logger.setLevel(args.log_level) + + auth_failure_file = "/tmp/last_auth_failure_date" + msg_file = "/var/log/messages" + if not os.path.exists(msg_file): + msg_file = "/var/log/syslog" + time_interval_hours = 6 + lft = LinkFlappingTest(time_interval=time_interval_hours) + link_data = lft.get_rdma_link_failures() + lft.process_rdma_link_flapping() diff --git a/playbooks/roles/metrics-exporter/files/shared_logging.py b/playbooks/roles/metrics-exporter/files/shared_logging.py new file mode 100644 index 00000000..086e0eaf --- /dev/null +++ b/playbooks/roles/metrics-exporter/files/shared_logging.py @@ -0,0 
+1,5 @@ +#!/usr/bin/env python3 + +import logging +logging.basicConfig(level="INFO", format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger('nem') diff --git a/playbooks/roles/metrics-exporter/files/xid_checker.py b/playbooks/roles/metrics-exporter/files/xid_checker.py new file mode 100644 index 00000000..3c46f36a --- /dev/null +++ b/playbooks/roles/metrics-exporter/files/xid_checker.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 + +import argparse +from shared_logging import logger +import subprocess +import sys +import re +import os + +class XidChecker: + def __init__(self, dmesg_cmd="dmesg", time_interval=60): + # if user is root + if not os.geteuid() == 0: + logger.info("The XidChecker script did not run since it must be run as root") + sys.exit(1) + self.dmesg_cmd = dmesg_cmd + self.results = {} + + + # Check for the following GPU Xid errors in dmesg + self.XID_EC = { + "1": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "2": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "3": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "4": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "5": {"description": "Unused", "severity": "Critical"}, + "6": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "7": {"description": "Invalid or corrupted push buffer address", "severity": "Critical"}, + "8": {"description": "GPU stopped processing", "severity": "Critical"}, + "9": {"description": "Driver error programming GPU", "severity": "Critical"}, + "10": {"description": "Unused", "severity": "Critical"}, + "11": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "12": {"description": "Driver error handling GPU exception", "severity": "Critical"}, + "13": {"description": "Graphics Engine Exception", "severity": "Critical"}, + "14": 
{"description": "Unused", "severity": "Warn"}, + "15": {"description": "Unused", "severity": "Warn"}, + "16": {"description": "Display engine hung", "severity": "Warn"}, + "17": {"description": "Unused", "severity": "Warn"}, + "18": {"description": "Bus mastering disabled in PCI Config Space", "severity": "Warn"}, + "19": {"description": "Display Engine error", "severity": "Warn"}, + "20": {"description": "Invalid or corrupted Mpeg push buffer", "severity": "Warn"}, + "21": {"description": "Invalid or corrupted Motion Estimation push buffer", "severity": "Warn"}, + "22": {"description": "Invalid or corrupted Video Processor push buffer", "severity": "Warn"}, + "23": {"description": "Unused", "severity": "Warn"}, + "24": {"description": "GPU semaphore timeout", "severity": "Warn"}, + "25": {"description": "Invalid or illegal push buffer stream", "severity": "Warn"}, + "26": {"description": "Framebuffer timeout", "severity": "Warn"}, + "27": {"description": "Video processor exception", "severity": "Warn"}, + "28": {"description": "Video processor exception", "severity": "Warn"}, + "29": {"description": "Video processor exception", "severity": "Warn"}, + "30": {"description": "GPU semaphore access error", "severity": "Warn"}, + "31": {"description": "GPU memory page fault", "severity": "Critical"}, + "32": {"description": "Invalid or corrupted push buffer stream", "severity": "Warn"}, + "33": {"description": "Internal micro-controller error", "severity": "Warn"}, + "34": {"description": "Video processor exception", "severity": "Warn"}, + "35": {"description": "Video processor exception", "severity": "Warn"}, + "36": {"description": "Video processor exception", "severity": "Warn"}, + "37": {"description": "Driver firmware error", "severity": "Warn"}, + "38": {"description": "Driver firmware error", "severity": "Warn"}, + "39": {"description": "Unused", "severity": "Warn"}, + "40": {"description": "Unused", "severity": "Warn"}, + "41": {"description": "Unused", 
"severity": "Warn"}, + "42": {"description": "Video processor exception", "severity": "Warn"}, + "43": {"description": "GPU stopped processing", "severity": "Warn"}, + "44": {"description": "Graphics Engine fault during context switch", "severity": "Warn"}, + "45": {"description": "Preemptive cleanup, due to previous errors -- Most likely to see when running multiple cuda applications and hitting a DBE", "severity": "Warn"}, + "46": {"description": "GPU stopped processing", "severity": "Warn"}, + "47": {"description": "Video processor exception", "severity": "Warn"}, + "48": {"description": "Double Bit ECC Error", "severity": "Critical"}, + "49": {"description": "Unused", "severity": "Warn"}, + "50": {"description": "Unused", "severity": "Warn"}, + "51": {"description": "Unused", "severity": "Warn"}, + "52": {"description": "Unused", "severity": "Warn"}, + "53": {"description": "Unused", "severity": "Warn"}, + "54": {"description": "Auxiliary power is not connected to the GPU board", "severity": "Warn"}, + "55": {"description": "Unused", "severity": "Warn"}, + "56": {"description": "Display Engine error", "severity": "Critical"}, + "57": {"description": "Error programming video memory interface", "severity": "Critical"}, + "58": {"description": "Unstable video memory interface detected", "severity": "Critical"}, + "59": {"description": "Internal micro-controller error (older drivers)", "severity": "Warn"}, + "60": {"description": "Video processor exception", "severity": "Warn"}, + "61": {"description": "Internal micro-controller breakpoint/warning (newer drivers)", "severity": "Warn"}, + "62": {"description": "Internal micro-controller halt", "severity": "Critical"}, + "63": {"description": "ECC page retirement or row remapping recording event", "severity": "Critical"}, + "64": {"description": "ECC page retirement or row remapper recording failure", "severity": "Critical"}, + "65": {"description": "Video processor exception", "severity": "Critical"}, + "66": 
{"description": "Illegal access by driver", "severity": "Warn"}, + "67": {"description": "Illegal access by driver", "severity": "Warn"}, + "68": {"description": "NVDEC0 Exception", "severity": "Critical"}, + "69": {"description": "Graphics Engine class error", "severity": "Critical"}, + "70": {"description": "CE3: Unknown Error", "severity": "Warn"}, + "71": {"description": "CE4: Unknown Error", "severity": "Warn"}, + "72": {"description": "CE5: Unknown Error", "severity": "Warn"}, + "73": {"description": "NVENC2 Error", "severity": "Critical"}, + "74": {"description": "NVLINK Error", "severity": "Critical"}, + "75": {"description": "CE6: Unknown Error", "severity": "Warn"}, + "76": {"description": "CE7: Unknown Error", "severity": "Warn"}, + "77": {"description": "CE8: Unknown Error", "severity": "Warn"}, + "78": {"description": "vGPU Start Error", "severity": "Warn"}, + "79": {"description": "GPU has fallen off the bus", "severity": "Critical"}, + "80": {"description": "Corrupted data sent to GPU", "severity": "Critical"}, + "81": {"description": "VGA Subsystem Error", "severity": "Critical"}, + "82": {"description": "NVJPGO Error", "severity": "Warn"}, + "83": {"description": "NVDEC1 Error", "severity": "Warn"}, + "84": {"description": "NVDEC2 Error", "severity": "Warn"}, + "85": {"description": "CE9: Unknown Error", "severity": "Warn"}, + "86": {"description": "OFA Exception", "severity": "Warn"}, + "87": {"description": "Reserved", "severity": "Warn"}, + "88": {"description": "NVDEC3 Error", "severity": "Warn"}, + "89": {"description": "NVDEC4 Error", "severity": "Warn"}, + "90": {"description": "Reserved", "severity": "Warn"}, + "91": {"description": "Reserved", "severity": "Warn"}, + "92": {"description": "High single-bit ECC error rate", "severity": "Critical"}, + "93": {"description": "Non-fatal violation of provisioned InfoROM wear limit", "severity": "Warn"}, + "94": {"description": "Contained ECC error", "severity": "Critical"}, + "95": {"description": 
"Uncontained ECC error", "severity": "Critical"}, + "96": {"description": "NVDEC5 Error", "severity": "Warn"}, + "97": {"description": "NVDEC6 Error", "severity": "Warn"}, + "98": {"description": "NVDEC7 Error", "severity": "Warn"}, + "99": {"description": "NVJPG1 Error", "severity": "Warn"}, + "100": {"description": "NVJPG2 Error", "severity": "Warn"}, + "101": {"description": "NVJPG3 Error", "severity": "Warn"}, + "102": {"description": "NVJPG4 Error", "severity": "Warn"}, + "103": {"description": "NVJPG5 Error", "severity": "Warn"}, + "104": {"description": "NVJPG6 Error", "severity": "Warn"}, + "105": {"description": "NVJPG7 Error", "severity": "Warn"}, + "106": {"description": "SMBPBI Test Message", "severity": "Warn"}, + "107": {"description": "SMBPBI Test Message Silent", "severity": "Warn"}, + "108": {"description": "Reserved", "severity": "Warn"}, + "109": {"description": "Context Switch Timeout Error", "severity": "Critical"}, + "110": {"description": "Security Fault Error", "severity": "Warn"}, + "111": {"description": "Display Bundle Error Event", "severity": "Warn"}, + "112": {"description": "Display Supervisor Error", "severity": "Warn"}, + "113": {"description": "DP Link Training Error", "severity": "Warn"}, + "114": {"description": "Display Pipeline Underflow Error", "severity": "Warn"}, + "115": {"description": "Display Core Channel Error", "severity": "Warn"}, + "116": {"description": "Display Window Channel Error", "severity": "Warn"}, + "117": {"description": "Display Cursor Channel Error", "severity": "Warn"}, + "118": {"description": "Display Pixel Pipeline Error", "severity": "Warn"}, + "119": {"description": "GSP RPC Timeout", "severity": "Critical"}, + "120": {"description": "GSP Error", "severity": "Critical"}, + "121": {"description": "C2C Link Error", "severity": "Critical"}, + "122": {"description": "SPI PMU RPC Read Failure", "severity": "Warn"}, + "123": {"description": "SPI PMU RPC Write Failure", "severity": "Warn"}, + "124": 
{"description": "SPI PMU RPC Erase Failure", "severity": "Warn"}, + "125": {"description": "Inforom FS Failure", "severity": "Warn"}, + "126": {"description": "Reserved", "severity": "Warn"}, + "127": {"description": "Reserved", "severity": "Warn"}, + "128": {"description": "Reserved", "severity": "Warn"}, + "129": {"description": "Reserved", "severity": "Warn"}, + "130": {"description": "Reserved", "severity": "Warn"}, + "131": {"description": "Reserved", "severity": "Warn"}, + "132": {"description": "Reserved", "severity": "Warn"}, + "133": {"description": "Reserved", "severity": "Warn"}, + "134": {"description": "Reserved", "severity": "Warn"}, + "135": {"description": "Reserved", "severity": "Warn"}, + "136": {"description": "Reserved", "severity": "Warn"}, + "137": {"description": "Reserved", "severity": "Warn"}, + "138": {"description": "Reserved", "severity": "Warn"}, + "139": {"description": "Reserved", "severity": "Warn"}, + "140": {"description": "Unrecovered ECC Error", "severity": "Warn"}, + "141": {"description": "Reserved", "severity": "Warn"}, + "142": {"description": "Reserved", "severity": "Warn"}, + "143": {"description": "GPU Initialization Failure", "severity": "Warn"} + } + + def check_gpu_xid(self): + status = "Pass" + dmesg_output = subprocess.check_output([self.dmesg_cmd]).decode("utf-8") + if "NVRM: Xid" in dmesg_output: + for XID in self.XID_EC.keys(): + logger.debug(f"Checking for GPU Xid {XID} error in dmesg") + + matches = re.findall(f"NVRM: Xid \(PCI:(.*?): {XID},", dmesg_output) + tmp_dict = {} + for match in matches: + if match not in tmp_dict: + tmp_dict[match] = 1 + else: + tmp_dict[match] = tmp_dict[match] + 1 + for x in tmp_dict.keys(): + logger.info(f"{XID} : count: {tmp_dict[x]}, {self.XID_EC[XID]['description']} - PCI: {x}") + if not matches: + logger.debug(f"No GPU Xid {XID} error found in dmesg") + if tmp_dict != {}: + if self.XID_EC[XID]['severity'] == "Critical": + status = "Failed" + self.results[XID] = {"results": 
tmp_dict, "description": self.XID_EC[XID]['description']} + else: + logger.info("Xid Check: Passed") + return {"status": status, "results": self.results} + + +if __name__ == '__main__': + # Argument parsing + parser = argparse.ArgumentParser(description='Check for GPU Xid errors.') + parser.add_argument('--dmesg_cmd', default='dmesg', help='Dmesg file to check. Default is dmesg.') + args = parser.parse_args() + + + logger.debug(f"Using dmesg command: {args.dmesg_cmd}") + + xc = XidChecker(dmesg_cmd=args.dmesg_cmd) + results = xc.check_gpu_xid() + logger.debug("Status: {}, Results: {}".format(results["status"], results["results"])) diff --git a/playbooks/roles/metrics-exporter/tasks/custom_metrics.yml b/playbooks/roles/metrics-exporter/tasks/custom_metrics.yml new file mode 100644 index 00000000..fdcf70ef --- /dev/null +++ b/playbooks/roles/metrics-exporter/tasks/custom_metrics.yml @@ -0,0 +1,59 @@ +--- +# tasks file for custom metrics +- name: Create a folder under /opt/oci-hpc for storing custom metric scripts + become: true + vars: + ne_script_dir: /opt/oci-hpc/nodeexporter/scripts + file: + path: "{{ ne_script_dir }}" + state: directory + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + mode: '0775' + recurse: yes + +- name: Copying custom metrics main script + become: true + vars: + ne_script_dir: /opt/oci-hpc/nodeexporter/scripts + template: + src: templates/custom_metrics.py.j2 + dest: "{{ ne_script_dir }}/custom_metrics.py" + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + mode: '0775' + +- name: Copying additional custom metric scripts + become: true + vars: + ne_script_dir: /opt/oci-hpc/nodeexporter/scripts + copy: + src: "files/{{ item }}" + dest: "{{ ne_script_dir }}" + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + mode: '0775' + with_items: + - custom_metric_common.py + - shared_logging.py + - rdma_link_flapping.py + - xid_checker.py + +- name: Copying custom metric service file + become: true + vars: + 
ne_script_dir: /opt/oci-hpc/nodeexporter/scripts + template: + src: templates/customMetrics.service.j2 + dest: /etc/systemd/system/customMetrics.service + owner: root + group: root + mode: '0755' + +- name: Enable and Start Custom Metric Service + become: true + service: + name=customMetrics.service + state=started + enabled=yes + diff --git a/playbooks/roles/metrics-exporter/tasks/dcgm_exporter.yml b/playbooks/roles/metrics-exporter/tasks/dcgm_exporter.yml new file mode 100644 index 00000000..47bcd8bb --- /dev/null +++ b/playbooks/roles/metrics-exporter/tasks/dcgm_exporter.yml @@ -0,0 +1,82 @@ +--- +- name: Install dependencies + become: true + package: + name: "{{ dependencies }}" + state: present + +- name: Download Go tarball + become: true + get_url: + url: "{{ go_url }}" + dest: "/tmp/{{ go_tarball }}" + +- name: Remove old Go installation + become: true + file: + path: "{{ go_install_dir }}/go" + state: absent + +- name: Extract Go tarball + become: true + unarchive: + src: "/tmp/{{ go_tarball }}" + dest: "{{ go_install_dir }}" + remote_src: true + +- name: Ensure Go binary path is in PATH + become: true + lineinfile: + path: /etc/profile.d/go.sh + line: "export PATH=$PATH:{{ go_path }}" + create: yes + state: present + +- name: Clone DCGM Exporter repository + become: true + git: + repo: "{{ dcgm_repo }}" + dest: "/tmp/dcgm-exporter" + update: no + environment: + PATH: "{{ go_path }}:{{ ansible_env.PATH }}" + +- name: Build DCGM Exporter binary + become: true + command: make binary + args: + chdir: /tmp/dcgm-exporter + environment: + PATH: "{{ go_path }}:{{ ansible_env.PATH }}" + +- name: Install DCGM Exporter + become: true + command: make install + args: + chdir: /tmp/dcgm-exporter + environment: + PATH: "{{ go_path }}:{{ ansible_env.PATH }}" + +- name: Create systemd service for DCGM-Exporter + become: true + template: + src: dcgm-exporter.service.j2 + dest: "{{ dcgm_service_path }}" + +- name: Copy DCGM-Exporter counter configuration + become: true + 
copy: + src: files/dcgm-counters.csv + dest: "{{ dcgm_counter_config_path }}" + +- name: Reload systemd daemon + become: true + systemd: + daemon_reload: yes + +- name: Enable and start DCGM-Exporter service + become: true + systemd: + name: dcgm-exporter + enabled: yes + state: started diff --git a/playbooks/roles/metrics-exporter/tasks/main.yml b/playbooks/roles/metrics-exporter/tasks/main.yml new file mode 100644 index 00000000..fa3cad5c --- /dev/null +++ b/playbooks/roles/metrics-exporter/tasks/main.yml @@ -0,0 +1,17 @@ +- include_tasks: node_exporter_el.yml + when: ansible_os_family == 'RedHat' + +- include_tasks: node_exporter_ubuntu.yml + when: ansible_distribution == 'Ubuntu' + +- include_tasks: dcgm_exporter.yml + when: ('compute' in group_names) and 'GPU' in shape + +- include_tasks: rdma_exporter.yml + when: ('compute' in group_names) and cluster_network|bool + +- include_tasks: nvlink_exporter.yml + when: ('compute' in group_names) and 'GPU' in shape + +- include_tasks: custom_metrics.yml + when: ('compute' in group_names) and cluster_network|bool diff --git a/playbooks/roles/metrics-exporter/tasks/node_exporter_el.yml b/playbooks/roles/metrics-exporter/tasks/node_exporter_el.yml new file mode 100644 index 00000000..2bab4e4c --- /dev/null +++ b/playbooks/roles/metrics-exporter/tasks/node_exporter_el.yml @@ -0,0 +1,81 @@ +--- + +- name: Create user for prometheus + become: true + user: + name: "{{ prometheus_user }}" + state: present + createhome: no + shell: /usr/sbin/nologin + append: yes + +- name: Create /var/lib/prometheus/node_exporter directory + become: true + file: + path: "/var/lib/prometheus/node_exporter" + state: directory + owner: 'prometheus' + group: 'prometheus' + +- name: Download node_exporter {{ node_exporter }} + become: true + get_url: + url: https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter }}/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz + dest: /tmp/node_exporter-{{ node_exporter 
}}.linux-amd64.tar.gz + +- name: Extract node_exporter {{ node_exporter }}.linux-amd64.tar.gz into /var/lib/prometheus/node_exporter/ + become: true + unarchive: + src: /tmp/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz + dest: /var/lib/prometheus/node_exporter/ + extra_opts: [--strip-components=1] + remote_src: true + +- name: Recursively change ownership of a /var/lib/prometheus/node_exporter/ + become: true + file: + path: /var/lib/prometheus/node_exporter + state: directory + recurse: yes + owner: prometheus + group: prometheus + +- name: Create a symbolic link node_exporter + become: true + file: + src: /var/lib/prometheus/node_exporter/node_exporter + dest: /usr/bin/node_exporter + state: link + +- name: Configure node_exporter service + become: true + copy: + src: 'node_exporter.service' + dest: '/usr/lib/systemd/system/node_exporter.service' + + +- name: Run command deactivate selinux for node_exporter, chcon + become: true + command: chcon --reference=/bin/less /usr/bin/node_exporter + +- name: Make sure the python setuptools are installed + vars: + package_name: + - python3-setuptools + package_state: latest + include_role: + name: safe_yum + when: ansible_os_family == 'RedHat' + +- name: Run command deactivate selinux for node_exporter, semanage + become: true + command: semanage fcontext -a -t bin_t "/usr/bin/node_exporter" + register: node_exporter + failed_when: "node_exporter.rc != 0 and 'already defined' not in node_exporter.stderr" + +- name: start node_exporter.service + become: true + service: + name: node_exporter.service + state: restarted + enabled: true diff --git a/playbooks/roles/metrics-exporter/tasks/node_exporter_ubuntu.yml b/playbooks/roles/metrics-exporter/tasks/node_exporter_ubuntu.yml new file mode 100644 index 00000000..0a4146e3 --- /dev/null +++ b/playbooks/roles/metrics-exporter/tasks/node_exporter_ubuntu.yml @@ -0,0 +1,66 @@ +--- +- name: Create prometheus group on compute + become: true + group: + name: prometheus + 
state: present + system: yes + +- name: Create prometheus user on compute + become: true + user: + name: prometheus + comment: "prometheus user" + group: prometheus + createhome: no # Optional: depending on if you want to create a home directory + +- name: Create /var/lib/prometheus/node_exporter directory + become: true + file: + path: "/var/lib/prometheus/node_exporter" + state: directory + owner: 'prometheus' + group: 'prometheus' + +- name: Download node_exporter {{ node_exporter }} + become: true + get_url: + url: "https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter }}/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz" + dest: "/tmp/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz" + +- name: Extract node_exporter {{ node_exporter }}.linux-amd64.tar.gz into /var/lib/prometheus/node_exporter/ + become: true + unarchive: + src: "/tmp/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz" + dest: "/var/lib/prometheus/node_exporter/" + extra_opts: ["--strip-components=1"] + remote_src: true + +- name: Recursively change ownership of /var/lib/prometheus/node_exporter/ + become: true + file: + path: /var/lib/prometheus/node_exporter + state: directory + recurse: yes + owner: prometheus + group: prometheus + +- name: Create a symbolic link for node_exporter + become: true + file: + src: /var/lib/prometheus/node_exporter/node_exporter + dest: /usr/bin/node_exporter + state: link + +- name: Configure node_exporter service + become: true + copy: + src: 'node_exporter.service' + dest: '/etc/systemd/system/node_exporter.service' + +- name: Start and enable node_exporter service + become: true + systemd: + name: node_exporter.service + state: restarted + enabled: true diff --git a/playbooks/roles/metrics-exporter/tasks/nvlink_exporter.yml b/playbooks/roles/metrics-exporter/tasks/nvlink_exporter.yml new file mode 100644 index 00000000..d915b79a --- /dev/null +++ b/playbooks/roles/metrics-exporter/tasks/nvlink_exporter.yml @@ -0,0 +1,31 @@ 
+--- +- name: Install prometheus_client python package + ansible.builtin.pip: + name: prometheus_client + executable: /usr/bin/pip3 + become: yes + +- name: Copy service file to scripts directory + copy: + src: nvlink_counters_exporter.py + dest: /usr/local/bin + mode: 0755 + become: yes + +- name: Render systemd service file + become: true + template: + src: nvlink-exporter.service.j2 + dest: /etc/systemd/system/nvlink-exporter.service + force: yes + backup: yes + owner: prometheus + group: prometheus + mode: 0744 + +- name: Restart nvlink exporter + become: true + service: + name: nvlink-exporter + state: restarted + enabled: yes diff --git a/playbooks/roles/metrics-exporter/tasks/rdma_exporter.yml b/playbooks/roles/metrics-exporter/tasks/rdma_exporter.yml new file mode 100644 index 00000000..7e93d3f6 --- /dev/null +++ b/playbooks/roles/metrics-exporter/tasks/rdma_exporter.yml @@ -0,0 +1,31 @@ +--- +- name: Install prometheus_client python package + ansible.builtin.pip: + name: prometheus_client + executable: /usr/bin/pip3 + become: yes + +- name: Copy service file to scripts directory + copy: + src: rdma_counters_exporter.py + dest: /usr/local/bin + mode: 0755 + become: yes + +- name: Render systemd service file + become: true + template: + src: rdma-exporter.service.j2 + dest: /etc/systemd/system/rdma-exporter.service + force: yes + backup: yes + owner: prometheus + group: prometheus + mode: 0755 + +- name: Restart rdma exporter + become: true + service: + name: rdma-exporter + state: restarted + enabled: yes diff --git a/playbooks/roles/metrics-exporter/templates/customMetrics.service.j2 b/playbooks/roles/metrics-exporter/templates/customMetrics.service.j2 new file mode 100644 index 00000000..2a8c8f9f --- /dev/null +++ b/playbooks/roles/metrics-exporter/templates/customMetrics.service.j2 @@ -0,0 +1,13 @@ +[Unit] +Description=Captures custom metrics for node exporter +Wants=network-online.target +After=network-online.target + +[Service] +User={{ ansible_user }} 
def _version_tuple(version_text):
    """Convert a dotted version string such as "1.39.0" into a tuple of ints."""
    return tuple(int(part) for part in re.findall(r'\d+', version_text))


def oca_version_metric(min_version):
    """Publish the installed Oracle Cloud Agent version as a textfile metric.

    The version is read from ``snap info`` on Ubuntu and from ``rpm -qa`` on
    Oracle Linux. The sample value is "1" when the installed version is
    strictly greater than ``min_version`` and "0" otherwise; when the version
    cannot be determined the sample is "0" with ``version="unknown"``.

    Relies on the module-level ``textfile_dir_path``, ``node_exporter_user``
    and ``node_exporter_group`` assigned in the ``__main__`` section.

    Args:
        min_version: dotted version string, e.g. "1.39.0".

    Returns:
        True once the metric file has been handed to ``copy_metric_file``.
    """
    metadata = get_metadata()
    oci_name = metadata['displayName']

    version = None
    if platform.system() == 'Linux':
        try:
            distro_name = platform.linux_distribution()[0]
        except AttributeError:
            # platform.linux_distribution() is missing on newer Pythons;
            # fall back to the ``distro`` package as the original code did.
            import distro
            distro_name = distro.name()

        command = None
        pattern = None
        if 'Ubuntu' in distro_name:
            command = ['snap', 'info', 'oracle-cloud-agent']
            if not is_user_root():
                command.insert(0, 'sudo')
            pattern = r'installed:\s+(\d+\.\d+\.\d+)'
        elif 'Oracle' in distro_name:
            command = ['rpm', '-qa']
            pattern = r'oracle-cloud-agent-(\d+\.\d+\.\d+)'

        if command is not None:
            result = subprocess.run(command, stdout=subprocess.PIPE)
            match = re.search(pattern, result.stdout.decode('utf-8'))
            if match:
                version = match.group(1)

    # Textfile name for metrics
    tf_name = 'oca_version.prom'
    tf_path = os.path.join(textfile_dir_path, tf_name)

    # Temporary file is suffixed with the current process id
    tmp_tf_path = os.path.join('/tmp', tf_name) + "." + str(os.getpid())

    if version is None:
        # Previously this path raised on the unbound ``version`` name.
        logger.warning("Unable to determine the installed Oracle Cloud Agent version")
        version = "unknown"
        oca_ver_metric_value = "0"
    elif _version_tuple(version) > _version_tuple(min_version):
        # Numeric comparison: as plain strings "1.9.0" sorts above "1.39.0".
        # NOTE(review): strict ">" is kept from the original, so a version
        # equal to min_version reports 0 -- confirm whether ">=" is intended.
        oca_ver_metric_value = "1"
    else:
        oca_ver_metric_value = "0"

    help_text = "# HELP oca_version Version of OCA installed on host"
    type_text = "# TYPE oca_version gauge"
    metric_text = f'oca_version{{version="{version}",hostname="{oci_name}"}} {oca_ver_metric_value}\n'

    with open(tmp_tf_path, "w") as tmp_tf:
        tmp_tf.write('{}\n{}\n'.format(help_text, type_text))
        tmp_tf.write('\n')
        tmp_tf.write(metric_text)

    copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

    return True
result.stderr.decode('utf-8') + + if stderr and stderr.find("-E-") != -1: + stderr = stderr.split("\n") + stderr_line = ", ".join(stderr) + logger.debug(f"{device}: {stderr_line}") + link_issues.append(f"{device}: {stderr[0]}") + status = "False" + continue + + # Find the line containing "Recommendation" + color_pattern = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') + link_state = re.search(r'\nState.*', output).group().split(":")[1].strip() + recommendation = re.search(r'Recommendation.*', output).group().split(":")[1].strip() + physical_BER = re.search(r'Raw Physical BER.*', output).group().split(":")[1].strip() + + # Remove hidden characters from the output + link_state = re.sub(color_pattern, '', link_state) + recommendation = re.sub(color_pattern, '', recommendation) + + # Update interconnect_status dictionary with interconnect name and status + if link_state != "Active": + status = False + interconnect_status.update({device : 0}) + elif not "No issue was observed" in recommendation: + if "Bad signal integrity" in recommendation and float(physical_BER) > 1e-07: + interconnect_status.update({device : 0}) + status = False + else: + interconnect_status.update({device : 1}) + + # Get RDMA and Network Devices + rdma_netdev_map = get_net_devices() + + # Textfile name for metrics + tf_name = 'rdma_link_status.prom' + tf_path = os.path.join(textfile_dir_path, tf_name) + + # Get current process id and create a temporary textfile + process_pid = os.getpid() + tmp_tf_path = os.path.join('/tmp', tf_name) + "." 
def rttcc_status_metric():
    """Publish one ``rttcc_status`` sample per RDMA device.

    For every device returned by ``get_rdma_devices`` the PPCC register is
    read with ``mlxreg``; the device is reported as 1 when any output line
    starting with ``value`` contains ``0x00000001`` and as 0 otherwise.

    Relies on the module-level ``textfile_dir_path``, ``node_exporter_user``
    and ``node_exporter_group`` assigned in the ``__main__`` section.

    Returns:
        True once the metric file has been handed to ``copy_metric_file``.
    """
    metadata = get_metadata()
    oci_shape = metadata['shape']
    devices = get_rdma_devices(oci_shape)

    reg_indexes = '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'
    link_status = {}

    for device in devices:
        # NOTE(review): the non-root and root branches issue different mlxreg
        # invocations (--get vs --set cmd_type=3); kept as in the original --
        # confirm both yield the same "value" lines.
        if not is_user_root():
            command = ['sudo', 'mlxreg', '-d', device, '-y', '--get', '--reg_name=PPCC', reg_indexes]
        else:
            command = ['mlxreg', '-d', device, '-y', '--set', 'cmd_type=3', '--reg_name=PPCC', reg_indexes]

        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output = result.stdout.decode('utf-8')

        value_lines = [line for line in output.split('\n') if line.startswith('value')]
        # The original assigned "disabled" after the scan unconditionally,
        # overwriting any "enabled" result; derive the state in one step.
        enabled = any("0x00000001" in line for line in value_lines)
        link_status[device] = 1 if enabled else 0

    # Get RDMA and Network Devices
    rdma_netdev_map = get_net_devices()

    # Textfile name for metrics
    tf_name = 'rdma_rttcc_status.prom'
    tf_path = os.path.join(textfile_dir_path, tf_name)

    # Temporary file is suffixed with the current process id
    tmp_tf_path = os.path.join('/tmp', tf_name) + "." + str(os.getpid())

    help_text = "# HELP rttcc_status Status of RTTCC"
    type_text = "# TYPE rttcc_status gauge"

    with open(tmp_tf_path, "w") as tmp_tf:
        tmp_tf.write('{}\n{}\n'.format(help_text, type_text))
        tmp_tf.write('\n')
        for rdma_dev, dev_status in link_status.items():
            if rdma_dev in rdma_netdev_map:
                net_dev = rdma_netdev_map[rdma_dev]
                tmp_tf.write(
                    f'rttcc_status{{rdma_device="{rdma_dev}",net_device="{net_dev}"}} {dev_status}\n'
                )
            else:
                print("rdma dev not found in rdma_netdev_map:" + rdma_dev)
                print(rdma_netdev_map.keys())

    copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

    return True
def rdma_link_flap_check():
    """Publish one ``rdma_link_noflap`` sample per RDMA network device.

    ``LinkFlappingTest`` supplies two issue lists ("failures" and
    "link_down"); the text before the first ":" of each issue is treated as
    the affected net device. Every entry of ``get_net_devices()`` whose net
    device name contains "rdma" is reported as 0 when it appears in either
    list and as 1 otherwise.

    Relies on the module-level ``textfile_dir_path``, ``node_exporter_user``
    and ``node_exporter_group`` assigned in the ``__main__`` section.

    Returns:
        True once the metric file has been handed to ``copy_metric_file``.
    """
    # Check for RDMA link flapping; a failing check is treated as "no issues"
    try:
        lf_interval = "6"
        lft = LinkFlappingTest(time_interval=lf_interval)
        lft.get_rdma_link_failures()
        lft_issues = lft.process_rdma_link_flapping()
    except Exception as e:
        logger.warning(f"Failed to check RDMA link flapping with error: {e}")
        lft_issues = {"failures": [], "link_down": []}

    # Get RDMA and Network Devices
    rdma_netdev_map = get_net_devices()

    # Textfile name for metrics
    tf_name = 'rdma_link_flap.prom'
    tf_path = os.path.join(textfile_dir_path, tf_name)

    # Temporary file is suffixed with the current process id
    tmp_tf_path = os.path.join('/tmp', tf_name) + "." + str(os.getpid())

    # Net devices named in either issue list
    unhealthy_net_devices = set()
    for issue_kind in ("failures", "link_down"):
        for issue in lft_issues[issue_kind]:
            unhealthy_net_devices.add(issue.split(":")[0])

    # HELP/TYPE must name the metric that is actually emitted below; the
    # original declared "rdma_link_flap", leaving the samples without metadata.
    help_text = "# HELP rdma_link_noflap 1 if the RDMA device is not flapping, 0 otherwise"
    type_text = "# TYPE rdma_link_noflap gauge"

    with open(tmp_tf_path, "w") as tmp_tf:
        tmp_tf.write('{}\n{}\n'.format(help_text, type_text))
        tmp_tf.write('\n')
        for rdma_dev, net_dev in rdma_netdev_map.items():
            if "rdma" not in net_dev:
                continue
            noflap = "0" if net_dev in unhealthy_net_devices else "1"
            tmp_tf.write(
                f'rdma_link_noflap{{rdma_device="{rdma_dev}",net_device="{net_dev}"}} {noflap}\n'
            )

    copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

    return True
alloc_gpu_count = 8 + elif oci_shape == "BM.GPU4.8": + alloc_gpu_count = 8 + elif oci_shape == "BM.GPU.A10.4": + alloc_gpu_count = 4 + elif oci_shape == "VM.GPU.A10.1": + alloc_gpu_count = 1 + elif oci_shape == "VM.GPU.A10.2": + alloc_gpu_count = 2 + + # Command to list all available GPUs + command = ['nvidia-smi', '--list-gpus'] + result = subprocess.run(command, stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + output_to_lines = output.split('\n') + # remove empty lines and total entries + lines = [line for line in output_to_lines if line] + avail_gpu_count = len(lines) + + # Check if available count is equal to allocated + gpu_metric = "" + if avail_gpu_count == alloc_gpu_count: + gpu_metric = 1 + else: + gpu_metric = 0 + + # Textfile name for metrics + tf_name = 'gpu_count_metric.prom' + tf_path = os.path.join(textfile_dir_path, tf_name) + + # Get current process id and create a temporary textfile + process_pid = os.getpid() + tmp_tf_path = os.path.join('/tmp', tf_name) + "." 
def check_ecc_errors():
    """Publish ``gpu_ecc_error_check`` from ``nvidia-smi -q`` output.

    The "SRAM Uncorrectable" (or "SRAM Uncorrectable Parity") and
    "DRAM Uncorrectable" counters are collected; even-indexed matches are
    treated as the volatile counters and odd-indexed ones as the aggregate
    counters, as in the original code. The sample is 0 when any issue that is
    neither "Aggregate" nor "Skipped" is found, and 1 otherwise.

    Relies on the module-level ``textfile_dir_path``, ``node_exporter_user``,
    ``node_exporter_group`` and ``host_serial`` assigned in the ``__main__``
    section.

    Returns:
        ``[]`` when ``nvidia-smi`` is not installed, otherwise True once the
        metric file has been handed to ``copy_metric_file``.
    """
    try:
        # Run the nvidia-smi -q command
        result = subprocess.run(['nvidia-smi', '-q'], stdout=subprocess.PIPE)
    except FileNotFoundError:
        logger.warning("Skipping SRAM/DRAM ECC Test: nvidia-smi command not found")
        return []

    # Decode the output from bytes to string
    output = result.stdout.decode('utf-8')

    # Collect the uncorrectable SRAM/DRAM counters and the GPU header lines
    sram_matches = re.findall(r'SRAM Uncorrectable\s+:\s+(\d+)', output)
    if not sram_matches:
        sram_matches = re.findall(r'SRAM Uncorrectable Parity\s+:\s+(\d+)', output)
    dram_matches = re.findall(r'DRAM Uncorrectable\s+:\s+(\d+)', output)
    gpu_matches = re.findall(r'\nGPU\s+(.*)\n', output)

    counters = (
        ("Volatile SRAM Uncorrectable", sram_matches[0::2]),
        ("Volatile DRAM Uncorrectable", dram_matches[0::2]),
        ("Aggregate SRAM Uncorrectable", sram_matches[1::2]),
        ("Aggregate DRAM Uncorrectable", dram_matches[1::2]),
    )

    ecc_issues = []
    for i, gpu in enumerate(gpu_matches):
        logger.debug(f"GPU: {gpu}")
        for label, values in counters:
            # A counter may be absent for some GPUs; the original indexed
            # unconditionally and could raise IndexError here.
            if i >= len(values):
                continue
            if values[i] != "0":
                logger.debug(f"{label}: {values[i]}")
                ecc_issues.append(f"{gpu} - {label}: {values[i]}")

    ecc_error = False
    for issue in ecc_issues:
        if "Skipped" in issue:
            logger.warning(f"{host_serial} - {issue}")
        elif "Aggregate" in issue:
            logger.warning(f"{host_serial} - ECC issues: {issue}")
        else:
            logger.error(f"{host_serial} - ECC issues: {issue}")
            ecc_error = True

    # Textfile name for metrics
    tf_name = 'gpu_ecc_error_check.prom'
    tf_path = os.path.join(textfile_dir_path, tf_name)

    # Temporary file is suffixed with the current process id
    tmp_tf_path = os.path.join('/tmp', tf_name) + "." + str(os.getpid())

    help_text = "# HELP gpu_ecc_error_check Pass or Fail based on uncorrectable ECC errors found in a GPU"
    type_text = "# TYPE gpu_ecc_error_check gauge"
    # Always emit a sample: the original wrote nothing when only aggregate
    # issues were present, which made the metric vanish.
    metric_text = ("gpu_ecc_error_check 0" if ecc_error else "gpu_ecc_error_check 1") + "\n"

    with open(tmp_tf_path, "w") as tmp_tf:
        tmp_tf.write('{}\n{}\n'.format(help_text, type_text))
        print(metric_text)
        tmp_tf.write(metric_text)

    copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

    return True
FileNotFoundError: + logger.warning("Skipping Row Remap Test: nvidia-smi command not found") + return [] + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + logger.debug("Output: {}".format(output)) + for i, line in enumerate(output.split('\n')): + if line == "": + continue + tmp_data = line.split(",") + tmp_data = [x.strip() for x in tmp_data] + if tmp_data[0] != "0": + logger.debug(f"GPU: {i} - Row Remap Pending: {tmp_data[0]}") + remap_issues.append(f"GPU: {i} Row Remap Pending: {tmp_data[0]}") + if tmp_data[1] != "0": + logger.debug(f"GPU: {i} - Row Remap Failure: {tmp_data[1]}") + #remap_issues.append(f"GPU: {i} Row Remap Failure: {tmp_data[1]}") + if tmp_data[2] != "0": + logger.debug(f"GPU: {i} - Row Remap Uncorrectable: {tmp_data[2]}") + if int(tmp_data[2]) > 512: + remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable >512: {tmp_data[2]}") + else: + remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable <512: {tmp_data[2]}")# Check if there are ecc_issues + + # Textfile name for metrics + tf_name = 'gpu_row_remap_error_check.prom' + tf_path = os.path.join(textfile_dir_path, tf_name) + + # Get current process id and create a temporary textfile + process_pid = os.getpid() + tmp_tf_path = os.path.join('/tmp', tf_name) + "." 
def xid_check():
    """Publish ``xid_error_check`` based on the output of ``xid_checker.py``.

    The checker script is executed and its stderr stream is inspected; the
    sample is 1 when that text contains "Passed" and 0 otherwise.

    Relies on the module-level ``textfile_dir_path``, ``node_exporter_user``
    and ``node_exporter_group`` assigned in the ``__main__`` section.

    Returns:
        True once the metric file has been handed to ``copy_metric_file``.
    """
    command = ['python3', '/opt/oci-hpc/nodeexporter/scripts/xid_checker.py']
    # Only escalate when needed, matching the other checks in this file.
    if not is_user_root():
        command.insert(0, 'sudo')
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # The verdict is read from stderr, as in the original.
    output = result.stderr.decode('utf-8')

    # Textfile name for metrics
    tf_name = 'xid_error_check.prom'
    tf_path = os.path.join(textfile_dir_path, tf_name)

    # Temporary file is suffixed with the current process id
    tmp_tf_path = os.path.join('/tmp', tf_name) + "." + str(os.getpid())

    help_text = "# HELP xid_error_check Pass or Fail based on xid errors thrown by GPU on a PCI Device "
    type_text = "# TYPE xid_error_check gauge"
    # Membership test: ``find(...) > 0`` missed a match at the very start.
    if "Passed" in output:
        metric_text = "xid_error_check 1" + "\n"
    else:
        metric_text = "xid_error_check 0" + "\n"

    with open(tmp_tf_path, "w") as tmp_tf:
        tmp_tf.write('{}\n{}\n'.format(help_text, type_text))
        print(metric_text)
        tmp_tf.write(metric_text)

    copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

    return True
every 10 minutes + while True: + # Check if ibdev2netdev command exists and update status of RDMA interconnects + if not rdma_devices: + logger.debug(f'Shape does not support RDMA') + else: + # Check status of all RDMA links + rdma_link_metric(textfile_dir_path, node_exporter_user, node_exporter_group) + # Check and update status of RTTCC + rttcc_status_metric() + # Check flapping RDMA links + rdma_link_flap_check() + + # Check and update information about version of OCA installed + min_required_oca_version = "1.39.0" + oca_version_metric(min_required_oca_version) + + # Check if devices have fallen off the bus + check_bus_metric() + + # Check if Xid check Passed + xid_check() + + # Check if nvidia-smi command exists and run health checks + try: + subprocess.call(["nvidia-smi"]) + check_nvidia_gpu_count() + check_ecc_errors() + check_row_remap_errors() + + except FileNotFoundError: + logger.debug(f'Shape does not support nvidia-smi command') + + time.sleep(600) diff --git a/playbooks/roles/metrics-exporter/templates/dcgm-exporter.service.j2 b/playbooks/roles/metrics-exporter/templates/dcgm-exporter.service.j2 new file mode 100644 index 00000000..7fd17e4c --- /dev/null +++ b/playbooks/roles/metrics-exporter/templates/dcgm-exporter.service.j2 @@ -0,0 +1,12 @@ +[Unit] +Description=NVIDIA DCGM Exporter +After=network.target + +[Service] +ExecStart=/usr/bin/dcgm-exporter -f /etc/dcgm-counters.csv +StandardOutput=syslog +StandardError=syslog +SyslogIdentifier=dcgm-exporter + +[Install] +WantedBy=multi-user.target diff --git a/playbooks/roles/metrics-exporter/templates/nvlink-exporter.service.j2 b/playbooks/roles/metrics-exporter/templates/nvlink-exporter.service.j2 new file mode 100644 index 00000000..e9009150 --- /dev/null +++ b/playbooks/roles/metrics-exporter/templates/nvlink-exporter.service.j2 @@ -0,0 +1,13 @@ +[Unit] +Description=NVLink bandwidth tracker for all GPUs +Wants=network-online.target +After=network-online.target + +[Service] +User={{ prometheus_user }} 
+Group={{ prometheus_user }} +Type=simple +ExecStart=/usr/bin/env python3 /usr/local/bin/nvlink_counters_exporter.py + +[Install] +WantedBy=multi-user.target diff --git a/playbooks/roles/metrics-exporter/templates/prometheus.yml.j2 b/playbooks/roles/metrics-exporter/templates/prometheus.yml.j2 new file mode 100644 index 00000000..d8829e81 --- /dev/null +++ b/playbooks/roles/metrics-exporter/templates/prometheus.yml.j2 @@ -0,0 +1,54 @@ + +{% set cpulist=[] %} +{% set gpulist=[] %} +{% for i in range(sinfo.stdout.split()[2:]| length) %} + +{% if loop.index is even %} +{% set ind= loop.index-2 %} +{% if 'GPU' in sinfo.stdout.split()[2:][i]%} +{{ gpulist.append( '"' ~ sinfo.stdout.split()[2:][ind] ~ ':' ~ gpu_ports ~ '"') }} +{% endif %} +{{ cpulist.append( '"' ~ sinfo.stdout.split()[2:][ind] ~ ':' ~ cpu_ports ~ '"') }} + + +{% endif %} +{% endfor %} + + + +# my global config +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). + +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: + # - "first_rules.yml" + # - "second_rules.yml" + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + - job_name: "CPU Metrics" + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + static_configs: + - targets: [{{ cpulist|join(",") }}] + - job_name: "GPU Metrics" + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. 
+ + static_configs: + - targets: [{{ gpulist|join(",") }}] \ No newline at end of file diff --git a/playbooks/roles/metrics-exporter/templates/rdma-exporter.service.j2 b/playbooks/roles/metrics-exporter/templates/rdma-exporter.service.j2 new file mode 100644 index 00000000..69c62d80 --- /dev/null +++ b/playbooks/roles/metrics-exporter/templates/rdma-exporter.service.j2 @@ -0,0 +1,13 @@ +[Unit] +Description=RDMA Hardware counter exporter for RDMA NICs +Wants=network-online.target +After=network-online.target + +[Service] +User={{ prometheus_user }} +Group={{ prometheus_user }} +Type=simple +ExecStart=/usr/bin/env python3 /usr/local/bin/rdma_counters_exporter.py + +[Install] +WantedBy=multi-user.target diff --git a/playbooks/roles/metrics-exporter/vars/main.yml b/playbooks/roles/metrics-exporter/vars/main.yml new file mode 100644 index 00000000..00de20ad --- /dev/null +++ b/playbooks/roles/metrics-exporter/vars/main.yml @@ -0,0 +1,24 @@ +--- +# Variables +gpu_ports: 9400 +prometheus_user: prometheus + +go_version: "1.23.0" +go_tarball: "go{{ go_version }}.linux-amd64.tar.gz" +go_url: "https://go.dev/dl/{{ go_tarball }}" +go_install_dir: "/usr/local" +go_path: "/usr/local/go/bin" +dcgm_repo: "https://github.com/NVIDIA/dcgm-exporter.git" +dcgm_service_path: "/etc/systemd/system/dcgm-exporter.service" +dcgm_counter_config_path: "/etc" +dependencies: "{{ rhel_dependencies if ansible_os_family == 'RedHat' else ubuntu_dependencies }}" +ubuntu_dependencies: + - wget + - git + - make + - build-essential +rhel_dependencies: + - wget + - git + - make + - gcc diff --git a/playbooks/roles/oci-cloud-agent-updater/tasks/el.yml b/playbooks/roles/oci-cloud-agent-updater/tasks/el.yml new file mode 100644 index 00000000..08efc291 --- /dev/null +++ b/playbooks/roles/oci-cloud-agent-updater/tasks/el.yml @@ -0,0 +1,36 @@ +--- +- name: Install yum-plugin-versionlock for OL8 + yum: + name: python3-dnf-plugin-versionlock + state: latest + disable_gpg_check: yes + when: + - ansible_os_family 
== 'RedHat' and ansible_distribution_major_version == '8' + +- name: Hold latest OCA for OL + community.general.yum_versionlock: + name: oracle-cloud-agent + state: present + when: + - ansible_os_family == 'RedHat' + +- name: Check if the updater file exists + stat: + path: /etc/oracle-cloud-agent/updater.yml + register: updater_exist + +- name: Disable OCA updater + replace: + path: /etc/oracle-cloud-agent/updater.yml + regexp: 'upgrade_interval: 3600' + replace: 'upgrade_interval: -1' + when: updater_exist.stat.exists + +- name: Restart cloud agent updater + service: + name: oracle-cloud-agent-updater + state: restarted + retries: 5 + register: restart_cloud_agent_updater + until: restart_cloud_agent_updater is not failed + when: updater_exist.stat.exists diff --git a/playbooks/roles/oci-cloud-agent-updater/tasks/main.yml b/playbooks/roles/oci-cloud-agent-updater/tasks/main.yml new file mode 100644 index 00000000..de4b8936 --- /dev/null +++ b/playbooks/roles/oci-cloud-agent-updater/tasks/main.yml @@ -0,0 +1,4 @@ +- include_tasks: el.yml + when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' +- include_tasks: ubuntu.yml + when: ansible_os_family == 'Debian' \ No newline at end of file diff --git a/playbooks/roles/oci-cloud-agent-updater/tasks/ubuntu.yml b/playbooks/roles/oci-cloud-agent-updater/tasks/ubuntu.yml new file mode 100644 index 00000000..38f73459 --- /dev/null +++ b/playbooks/roles/oci-cloud-agent-updater/tasks/ubuntu.yml @@ -0,0 +1,33 @@ +--- +- name: Hold Oracle cloud-agent + shell: "sudo snap refresh --hold=forever oracle-cloud-agent" + when : ansible_distribution == 'Ubuntu' + ignore_errors: yes + +- name: Check if the updater file exists + stat: + path: /etc/oracle-cloud-agent/updater.yml + register: updater_exist + +- name: Disable OCA updater + replace: + path: /etc/oracle-cloud-agent/updater.yml + regexp: 'upgrade_interval: 3600' + replace: 'upgrade_interval: -1' + when: updater_exist.stat.exists + + +- name: Restart 
cloud agent updater + service: + name: snap.oracle-cloud-agent.oracle-cloud-agent-updater.service + state: restarted + retries: 5 + register: restart_cloud_agent_updater + until: restart_cloud_agent_updater is not failed + when: updater_exist.stat.exists + +# oracle-cloud-agent runs by default on ubuntu +# However it doesn't have osms (OS Management Service) which we had to disable on Oracle Linux +# sudo snap list | grep oracle-cloud-agent +# to check for version +# snap info oracle-cloud-agent \ No newline at end of file diff --git a/playbooks/roles/oci-cloud-agent/tasks/el.yml b/playbooks/roles/oci-cloud-agent/tasks/el.yml index 5b4993ed..2011c434 100644 --- a/playbooks/roles/oci-cloud-agent/tasks/el.yml +++ b/playbooks/roles/oci-cloud-agent/tasks/el.yml @@ -1,4 +1,9 @@ --- + +- name: call oracle cloud agent updater role + include_role: + name: oci-cloud-agent-updater + - name: Check if agent.yml exists stat: path: /etc/oracle-cloud-agent/agent.yml @@ -31,6 +36,35 @@ dest: /etc/oracle-cloud-agent/agent.yml when: agent_file.stat.exists + +- name: Install yum-plugin-versionlock for OL8 + yum: + name: python3-dnf-plugin-versionlock + state: latest + disable_gpg_check: yes + when: + - ansible_os_family == 'RedHat' and ansible_distribution_major_version == '8' +- name: Hold latest OCA for OL + community.general.yum_versionlock: + name: oracle-cloud-agent + state: present + when: + - ansible_os_family == 'RedHat' + +- name: Disable OCA updater + replace: + path: /etc/oracle-cloud-agent/updater.yml + regexp: 'upgrade_interval: 3600' + replace: 'upgrade_interval: -1' + +- name: Restart cloud agent updater + service: + name: oracle-cloud-agent-updater + state: restarted + retries: 5 + register: restart_cloud_agent_updater + until: restart_cloud_agent_updater is not failed + - name: Restart cloud agent service: name: oracle-cloud-agent @@ -79,4 +113,4 @@ - name: Kill Yum Process shell: "sudo kill -9 {{yum_process.stdout}}" when: yum_process.stdout != "" - ignore_errors: 
True \ No newline at end of file + ignore_errors: True diff --git a/playbooks/roles/oci-cloud-agent/tasks/main.yml b/playbooks/roles/oci-cloud-agent/tasks/main.yml index cbceb221..de4b8936 100644 --- a/playbooks/roles/oci-cloud-agent/tasks/main.yml +++ b/playbooks/roles/oci-cloud-agent/tasks/main.yml @@ -1,4 +1,4 @@ - include_tasks: el.yml when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' - include_tasks: ubuntu.yml - when: ansible_os_family == 'Debian' + when: ansible_os_family == 'Debian' \ No newline at end of file diff --git a/playbooks/roles/oci-cloud-agent/tasks/ubuntu.yml b/playbooks/roles/oci-cloud-agent/tasks/ubuntu.yml index 14a7622a..35cb29b6 100644 --- a/playbooks/roles/oci-cloud-agent/tasks/ubuntu.yml +++ b/playbooks/roles/oci-cloud-agent/tasks/ubuntu.yml @@ -1,14 +1,10 @@ --- -- block: - - - - name: debug - debug: - msg: "no change for ubuntu" +- name: call oracle cloud agent updater + include_role: + name: oci-cloud-agent-updater # oracle-cloud-agent runs by default on ubuntu # However it doesn't have osms (OS Management Service) which we had to disable on Oracle Linux # sudo snap list | grep oracle-cloud-agent # to check for version -# snap info oracle-cloud-agent - +# snap info oracle-cloud-agent \ No newline at end of file diff --git a/playbooks/roles/openldap/tasks/el.yml b/playbooks/roles/openldap/tasks/el.yml index 94493978..515a42e8 100644 --- a/playbooks/roles/openldap/tasks/el.yml +++ b/playbooks/roles/openldap/tasks/el.yml @@ -33,6 +33,7 @@ name: https://vault.centos.org/centos/8/PowerTools/x86_64/os/Packages/openldap-servers-2.4.46-18.el8.x86_64.rpm state: present disable_gpg_check: true + when: ansible_distribution_major_version == '8' - name: Generate openldap root password diff --git a/playbooks/roles/openldap/vars/el_vars.yml b/playbooks/roles/openldap/vars/el_vars.yml index 5a335226..21bbf972 100644 --- a/playbooks/roles/openldap/vars/el_vars.yml +++ b/playbooks/roles/openldap/vars/el_vars.yml @@ -7,11 
+7,12 @@ openldap_packages: - rsync openldap_packages_ol8: - - openldap-clients + - openldap-2.4.46-18.el8.x86_64 + - openldap-clients-2.4.46-18.el8.x86_64 - rsync openldap_default_user: ldap openldap_default_group: ldap openldap_server_conf_path: /etc/openldap/slapd.d -openldap_server_defaults_file: /etc/sysconfig/slapd \ No newline at end of file +openldap_server_defaults_file: /etc/sysconfig/slapd diff --git a/playbooks/roles/prometheus/defaults/main.yml b/playbooks/roles/prometheus/defaults/main.yml new file mode 100644 index 00000000..31ee1e3d --- /dev/null +++ b/playbooks/roles/prometheus/defaults/main.yml @@ -0,0 +1,15 @@ +--- +# vars file for prometheus +prometheus_user: prometheus +prometheus_dest_dir: /etc/prometheus +prometheus_data_dir: /var/lib/prometheus +service_dest_dir: /etc/systemd/system +prometheus_download_dir: /tmp/prometheus-2.53.1.linux-amd64 +prometheus_download_url: https://github.com/prometheus/prometheus/releases/download/v2.53.1/prometheus-2.53.1.linux-amd64.tar.gz +# vars file for prometheus + +exporter_ports: + - "9100" + - "9400" + - "9500" + - "9600" diff --git a/playbooks/roles/prometheus/files/alert.rules.yml b/playbooks/roles/prometheus/files/alert.rules.yml new file mode 100644 index 00000000..cb6eb09d --- /dev/null +++ b/playbooks/roles/prometheus/files/alert.rules.yml @@ -0,0 +1,34 @@ +groups: +- name: InstanceAvailability + rules: + - alert: InstanceDown + expr: up{instance=~".*9100"}==0 + for: 5m + labels: + severity: critical + annotations: + summary: "Instance {{ $labels.hostname }} is down" + description: "The instance {{ $labels.hostname }} has been unreachable for more than 5 minutes." 
+ +- name: HostResourceUsage + rules: + - alert: HighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage on {{ $labels.hostname }}" + description: "Memory usage is above 90% (currently at {{ printf \"%.2f\" $value }}%) on instance {{ $labels.hostname }}." + +- name: GPUAlerts + rules: + - alert: GPUFallenOffBus + expr: increase(dcgm_gpu_errors_xid_count{XID="79"}[5m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "GPU Fallen Off the Bus on {{ $labels.instance }} (GPU {{ $labels.gpu }})" + description: "GPU {{ $labels.gpu }} on instance {{ $labels.instance }} has reported XID error 79 (GPU fallen off the bus). Immediate attention is required." + diff --git a/playbooks/roles/prometheus/tasks/gather_info.yml b/playbooks/roles/prometheus/tasks/gather_info.yml new file mode 100644 index 00000000..77a56439 --- /dev/null +++ b/playbooks/roles/prometheus/tasks/gather_info.yml @@ -0,0 +1,72 @@ +- name: Gather serial number + command: "sudo dmidecode -s system-serial-number" + register: serial_output + delegate_to: "{{ item }}" + +- name: Gather FSS IP + shell: "cat /etc/fstab | grep {{nfs_source_path}}" + register: nfs_output + delegate_to: "{{ item }}" + ignore_errors: yes + +- name: Extract the IP address using regex + set_fact: + ip_address: "{{ nfs_output.stdout | regex_search('([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3})') }}" + when: nfs_output.rc == 0 + +- name: Extract the IP address using regex + set_fact: + ip_address: "None" + when: nfs_output.rc != 0 + +- name: gather metadata + uri: + url: http://169.254.169.254/opc/v2/instance/ + method: GET + headers: + Authorization: 'Bearer Oracle' + return_content: yes + register: i_metadata + delegate_to: "{{ item }}" + +- name: set fact + set_fact: + instance_metadata: "{{ i_metadata['content'] }}" + +- name: make sure cluster_name in included + set_fact: + 
cluster_name: "{{ instance_metadata['displayName'].split('-') | slice(2, 10) | join('-') }}" + when: "not 'cluster_name' in instance_metadata['freeformTags']" + +- name: make sure cluster_name in included + set_fact: + cluster_name: "{{instance_metadata['freeformTags']['cluster_name']}}" + when: "'cluster_name' in instance_metadata['freeformTags']" + +- name: gather RDMA metadata + uri: + url: http://169.254.169.254/opc/v1/host/ + method: GET + headers: + Authorization: 'Bearer Oracle' + return_content: yes + register: h_metadata + delegate_to: "{{ item }}" + +- name: set fact + set_fact: + rdma_metadata: "{{ h_metadata['content'] }}" + +- name: add empty networkblockID + set_fact: + rdma_metadata : "{{ rdma_metadata | combine({'networkBlockId': 'None'}) }}" + when: "not 'networkBlockId' in rdma_metadata" + +- name: add empty rdmaTopologyData + set_fact: + rdma_metadata : "{{ rdma_metadata | combine({'rdmaTopologyData': {'customerHPCIslandId': 'None','customerLocalBlock': 'None','customerNetworkBlock': 'None'}}) }}" + when: "not 'rdmaTopologyData' in rdma_metadata" + +- name: Build the host_info dictionary + set_fact: + host_info: "{{ host_info | default({}) | combine({item: {'serial_number': serial_output.stdout, 'cluster_name': cluster_name, 'shape': instance_metadata['shape'] , 'ocid': instance_metadata['id'] , 'oci_name': instance_metadata['displayName'], 'availabilityDomain': instance_metadata['availabilityDomain'],'compartmentId': instance_metadata['compartmentId'],'rackID': rdma_metadata['rackId'],'networkBlockId': rdma_metadata['networkBlockId'],'rail_id': rdma_metadata['rdmaTopologyData']['customerLocalBlock'],'fss_ip': ip_address,'hpc_island': rdma_metadata['rdmaTopologyData']['customerHPCIslandId'] }}) }}" \ No newline at end of file diff --git a/playbooks/roles/prometheus/tasks/main.yml b/playbooks/roles/prometheus/tasks/main.yml new file mode 100644 index 00000000..efbbbdb3 --- /dev/null +++ b/playbooks/roles/prometheus/tasks/main.yml @@ -0,0 +1,151 @@ 
+--- +# tasks file for prometheus +- name: Create user for prometheus + become: true + user: + name: "{{ prometheus_user }}" # Username + state: present # Ensure the user is present + createhome: no # Create the user's home directory + shell: /usr/sbin/nologin + append: yes + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + +- name: Create installation folder in etc + become: true + file: + path: "{{ prometheus_dest_dir }}" + state: directory + owner: "{{ prometheus_user }}" + group: "{{ prometheus_user }}" + mode: '0775' + recurse: yes + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + + +- name: Create data folder + become: true + file: + path: "{{ prometheus_data_dir }}" + state: directory + owner: "{{ prometheus_user }}" + group: "{{ prometheus_user }}" + mode: '0775' + recurse: yes + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + +- name: Download/unarchive Packages for prometheus + become: true + unarchive: + src: "{{ prometheus_download_url }}" + dest: /tmp + owner: "{{ prometheus_user }}" + group: "{{ prometheus_user }}" + remote_src: true + creates: "{{ prometheus_download_dir }}" + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + +- name: Copying the service binary for prometheus + become: true + copy: + src: "{{ prometheus_download_dir }}/" + dest: /usr/local/bin/ + owner: "{{ prometheus_user }}" + group: "{{ prometheus_user }}" + mode: '0775' + remote_src: true + with_items: + - prometheus + - promtool + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + +- name: Copying the console binary + become: true + copy: + src: "{{ 
prometheus_download_dir }}/" + dest: "{{ prometheus_dest_dir }}" + owner: "{{ prometheus_user }}" + group: "{{ prometheus_user }}" + mode: '0775' + remote_src: true + with_items: + - consoles + - console_libraries + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + +- name: Copying default alert rules + become: true + copy: + src: files/alert.rules.yml + dest: /etc/prometheus + owner: "{{ prometheus_user }}" + group: "{{ prometheus_user }}" + mode: '0775' + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + +- name: Removing the tar file of prometheus + become: true + file: + path: "{{ prometheus_download_dir }}" + state: absent + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + +- name: Create prometheus systemd service file + become: true + template: + src: templates/prometheus.service.j2 + dest: "{{ service_dest_dir }}/prometheus.service" + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + +- name: Get current nodes in /etc/hosts + shell: "cat /etc/hosts | grep .local.vcn | awk '{print $2}'" + register: nodes_in_etc_hosts + delegate_to: 127.0.0.1 + run_once: true + ignore_errors: yes + +- name: Get controller and login nodes + shell: "cat /etc/hosts | grep \"controller\\|login\" | grep -v \"ANSIBLE MANAGED BLOCK\" | awk '{print $3}'" + register: c_l_nodes_in_etc_hosts + delegate_to: 127.0.0.1 + run_once: true + ignore_errors: yes + +- name: set fact + set_fact: + nodelist: "{{ nodes_in_etc_hosts.stdout_lines + c_l_nodes_in_etc_hosts.stdout_lines }}" + run_once: true + +- name: Loop over the list of hosts and gather serial number and cluster name + loop: "{{ nodelist }}" + include_tasks: gather_info.yml + run_once: true + +- name: 
Copying the scrape config + become: true + template: + src: templates/prometheus.yml.j2 + dest: "{{ prometheus_dest_dir }}/prometheus.yml" + owner: "{{ prometheus_user }}" + group: "{{ prometheus_user }}" + mode: '0775' + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + +- name: restart prometheus + become: true + service: + name: prometheus + state: restarted + daemon_reload: yes + enabled: yes + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" diff --git a/playbooks/roles/prometheus/templates/prometheus.conf.j2 b/playbooks/roles/prometheus/templates/prometheus.conf.j2 new file mode 100644 index 00000000..8fcbb7e9 --- /dev/null +++ b/playbooks/roles/prometheus/templates/prometheus.conf.j2 @@ -0,0 +1,8 @@ +--- +# handlers file for prometheus +- name: restart prometheus + service: + name: prometheus + state: restarted + daemon_reload: yes + enabled: yes \ No newline at end of file diff --git a/playbooks/roles/prometheus/templates/prometheus.service.j2 b/playbooks/roles/prometheus/templates/prometheus.service.j2 new file mode 100644 index 00000000..2fdd295d --- /dev/null +++ b/playbooks/roles/prometheus/templates/prometheus.service.j2 @@ -0,0 +1,18 @@ +[Unit] +Description=Prometheus +Wants=network-online.target +After=network-online.target + +[Service] +User={{ prometheus_user }} +Group={{ prometheus_user }} +Type=simple +ExecStart=/usr/local/bin/prometheus \ + --config.file /etc/prometheus/prometheus.yml \ + --storage.tsdb.path /var/lib/prometheus/ \ + --web.console.templates=/etc/prometheus/consoles \ + --web.console.libraries=/etc/prometheus/console_libraries \ + --web.listen-address=:9090 + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/playbooks/roles/prometheus/templates/prometheus.yml.j2 b/playbooks/roles/prometheus/templates/prometheus.yml.j2 new file mode 100644 
index 00000000..2cd79409 --- /dev/null +++ b/playbooks/roles/prometheus/templates/prometheus.yml.j2 @@ -0,0 +1,27 @@ +global: + scrape_interval: 60s + external_labels: + monitor: 'prometheus' + +rule_files: + - alert.rules.yml + +scrape_configs: + - job_name: 'prometheus' + static_configs: +{% for host, info in host_info.items() %} + - targets: [{% for port in exporter_ports %}{{ host }}:{{port}}{{ "," if not loop.last }}{%- endfor %}] + labels: + serial: {{ info.serial_number }} + hostname: {{ host }} + cluster_name: {{ info.cluster_name }} + ocid: {{ info.ocid }} + oci_name: {{ info.oci_name }} + AD: {{ info.availabilityDomain }} + compartment: {{ info.compartmentId }} + rackID: {{ info.rackID }} + networkBlockId: {{ info.networkBlockId }} + rail_id: {{ info.rail_id }} + hpc_island: {{ info.hpc_island }} + fss_mount: {{ info.fss_ip }} +{% endfor %} diff --git a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py index 027709e5..25923c0d 100644 --- a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py +++ b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py @@ -53,10 +53,14 @@ def write_ordered_rankfile(ordered_hosts=[],hostfile=None): output = client.run_command('curl http://169.254.169.254/opc/v1/host/') for host_out in output: j = json.loads(bytearray(''.join(list(host_out.stdout)).encode())) - if j['rackId'] in r: - r[j['rackId']].append( host_out.host ) + try: + rackID=j['rdmaTopologyData']['customerLocalBlock'] + except: + rackID = j['rackId'] + if rackID in r: + r[rackID].append( host_out.host ) else: - r[j['rackId']] = [ host_out.host ] + r[rackID] = [ host_out.host ] hostname_output = client.run_command('/usr/bin/hostname') for host_out in hostname_output: j = bytearray(''.join(list(host_out.stdout)).encode()) diff --git a/playbooks/roles/slurm/defaults/main.yml b/playbooks/roles/slurm/defaults/main.yml index a4983e9c..049d719a 100755 --- 
a/playbooks/roles/slurm/defaults/main.yml +++ b/playbooks/roles/slurm/defaults/main.yml @@ -9,4 +9,4 @@ slurm_uid: 1501 munge_gid: 1500 munge_uid: 1500 rack_aware_playbook_suffix: "{% if rack_aware|bool %}-rack-aware{% endif%}" -slurm_version: "23.02.5-1" \ No newline at end of file +slurm_version: "24.05.1-1" \ No newline at end of file diff --git a/playbooks/roles/slurm/files/cgroup.conf b/playbooks/roles/slurm/files/cgroup.conf index 57b5c5a2..568493f0 100755 --- a/playbooks/roles/slurm/files/cgroup.conf +++ b/playbooks/roles/slurm/files/cgroup.conf @@ -1,4 +1,3 @@ CgroupMountpoint="/sys/fs/cgroup" -CgroupAutomount=yes ConstrainDevices=yes ConstrainCores=yes \ No newline at end of file diff --git a/playbooks/roles/slurm/files/healthchecks.sh b/playbooks/roles/slurm/files/healthchecks.sh index d54dd837..5d69485c 100644 --- a/playbooks/roles/slurm/files/healthchecks.sh +++ b/playbooks/roles/slurm/files/healthchecks.sh @@ -1,11 +1,16 @@ #!/bin/sh shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` -if [ "${shape}" = \"BM.GPU.H100.8\" ] || [ "${shape}" == \"BM.GPU.A100-v2.8\" ] || [ "${shape}" == \"BM.GPU4.8\" ] || [ "${shape}" == \"BM.GPU.B4.8\" ] +if [ "${shape}" = \"BM.GPU.H100.8\" ] || [ "${shape}" = \"BM.GPU.A100-v2.8\" ] || [ "${shape}" = \"BM.GPU4.8\" ] || [ "${shape}" = \"BM.GPU.B4.8\" ] then sudo python3 /opt/oci-hpc/healthchecks/check_gpu_setup.py --slurm > /tmp/latest_healthcheck.log 2>&1 DRAIN_MSG=`cat /tmp/latest_healthcheck.log | grep "Healthcheck::"` if [ "$DRAIN_MSG" != "" ] then - scontrol update nodename=`hostname` state=drain reason="${DRAIN_MSG}" + if [ -n "$SLURM_JOB_ID" ]; then + echo "${DRAIN_MSG}" + exit 1 + else + scontrol update nodename=`hostname` state=drain reason="${DRAIN_MSG}" + fi fi fi diff --git a/playbooks/roles/slurm/tasks/backup_server.yml b/playbooks/roles/slurm/tasks/backup_server.yml index 1dbea29c..fbeaf7a8 100755 --- a/playbooks/roles/slurm/tasks/backup_server.yml +++ 
b/playbooks/roles/slurm/tasks/backup_server.yml @@ -110,6 +110,17 @@ retries: 10 delay: 5 +- name: move gres.conf on backup servers + become: true + copy: + dest: '{{ slurm_conf_path }}/gres.conf' + src: '{{ slurm_conf_path }}/gres.conf' + force: yes + register: gres_copied + until: gres_copied is not failed + retries: 10 + delay: 5 + - name: start slurmctld become: true service: diff --git a/playbooks/roles/slurm/tasks/cleanup.yml b/playbooks/roles/slurm/tasks/cleanup.yml index 8bc3f6f0..b3beee12 100755 --- a/playbooks/roles/slurm/tasks/cleanup.yml +++ b/playbooks/roles/slurm/tasks/cleanup.yml @@ -3,3 +3,10 @@ file: path: /tmp/munge.key state: absent + +- name: Reconfigure Slurm + become: true + command: "scontrol reconfigure" + delegate_to: 127.0.0.1 + run_once: true + ignore_errors: yes \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 865312ca..76a6f2bd 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -61,35 +61,104 @@ run_once: true when: download_path != '/tmp' -- name: Download slurm .deb - get_url: - url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" - dest: "{{ download_path }}/slurm_rpms/" - when: ansible_os_family == 'Debian' and download_path == '/tmp' - -- name: Download slurm .rpm - get_url: - url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/{{ item }}" - dest: "{{ download_path }}/slurm_rpms" - with_items: "{{slurm_all_packages}}" - when: ansible_os_family == 'RedHat' and download_path == '/tmp' - -- name: Download slurm .deb - get_url: - url: 
"https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" - dest: "{{ download_path }}/slurm_rpms" - when: ansible_os_family == 'Debian' and download_path != '/tmp' - delegate_to: 127.0.0.1 - run_once: true +- name: Download Slurm DEB + block: + - name: Download in shared location + block: + - name: Check if the deb file exists + stat: + path: "{{ download_path }}/slurm_rpms/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" + register: slurm_dwld + delegate_to: 127.0.0.1 + run_once: true + - name: First try downloading .deb + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/lQBJduG5m6xQWauRMPkouaOoliEHbbDIqgfQXou050XAcDav2UveFlRjvFv-0JIn/n/hpc_limited_availability/b/source/o/slurm/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" + dest: "{{ download_path }}/slurm_rpms" + when: ( not slurm_dwld.stat.exists ) + delegate_to: 127.0.0.1 + run_once: true + when: download_path != '/tmp' -- name: Download slurm .rpm - get_url: - url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/{{ item }}" - dest: "{{ download_path }}/slurm_rpms" - with_items: "{{slurm_all_packages}}" - when: ansible_os_family == 'RedHat' and download_path != '/tmp' - delegate_to: 127.0.0.1 - run_once: true + - name: Download on each node + block: + - name: Check if the deb file exists in /tmp + stat: + path: "{{ download_path }}/slurm_rpms/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" + register: slurm_dwld + - name: First try downloading .deb in /tmp + get_url: + url: 
"https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/lQBJduG5m6xQWauRMPkouaOoliEHbbDIqgfQXou050XAcDav2UveFlRjvFv-0JIn/n/hpc_limited_availability/b/source/o/slurm/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" + dest: "{{ download_path }}/slurm_rpms" + when: ( not slurm_dwld.stat.exists ) + when: download_path == '/tmp' + rescue: + - name: Second try downloading .deb + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" + dest: "{{ download_path }}/slurm_rpms" + when: download_path != '/tmp' + delegate_to: 127.0.0.1 + run_once: true + - name: Second try downloading .deb in /tmp + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" + dest: "{{ download_path }}/slurm_rpms" + when: download_path == '/tmp' + when: ansible_os_family == 'Debian' + +- name: Download Slurm RPMs + block: + - name: Download on each node + block: + - name: Check if the RPMs exist in tmp + stat: + path: "{{ download_path }}/slurm_rpms/{{item}}" + register: slurm_dwld_tmp + with_items: "{{slurm_all_packages}}" + - name: First try downloading RPMs in tmp + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/lQBJduG5m6xQWauRMPkouaOoliEHbbDIqgfQXou050XAcDav2UveFlRjvFv-0JIn/n/hpc_limited_availability/b/source/o/slurm/{{ item.item }}" + dest: "{{ download_path }}/slurm_rpms" + when: ( not item.stat.exists ) + with_items: "{{slurm_dwld_tmp.results}}" + when: download_path == '/tmp' + + - name: Download in shared location + block: + - name: Check if the RPMs exist + stat: + path: "{{ download_path }}/slurm_rpms/{{item}}" + register: slurm_dwld + delegate_to: 127.0.0.1 + run_once: true + with_items: 
"{{slurm_all_packages}}" + - name: First try downloading RPMs + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/lQBJduG5m6xQWauRMPkouaOoliEHbbDIqgfQXou050XAcDav2UveFlRjvFv-0JIn/n/hpc_limited_availability/b/source/o/slurm/{{ item.item }}" + dest: "{{ download_path }}/slurm_rpms" + when: ( not item.stat.exists ) + delegate_to: 127.0.0.1 + run_once: true + with_items: "{{slurm_dwld.results}}" + when: download_path != '/tmp' + + rescue: + - name: Second try downloading RPMs + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" + dest: "{{ download_path }}/slurm_rpms" + when: download_path != '/tmp' + delegate_to: 127.0.0.1 + run_once: true + with_items: "{{slurm_all_packages}}" + - name: Second try downloading RPMs in tmp + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" + dest: "{{ download_path }}/slurm_rpms" + when: download_path == '/tmp' + with_items: "{{slurm_all_packages}}" + when: ansible_os_family == 'RedHat' - name: Install .deb vars: @@ -190,7 +259,7 @@ include_tasks: common_pyxis.yml when: pyxis|bool -- name: Include pyxis prolog files +- name: Include pmix prolog files include_tasks: common_pmix.yml when: ansible_os_family == 'RedHat' diff --git a/playbooks/roles/slurm/tasks/common_pmix.yml b/playbooks/roles/slurm/tasks/common_pmix.yml index 630e2530..2687c96e 100644 --- a/playbooks/roles/slurm/tasks/common_pmix.yml +++ b/playbooks/roles/slurm/tasks/common_pmix.yml @@ -11,22 +11,59 @@ name: safe_yum when: ansible_os_family == 'RedHat' -- name: Download slurm .rpm - get_url: - url: 
"https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/tgnPgvPv68JpWqLklTNY86rBsJ0z7Ebp3zs7Ud4X2_R8TZFgpm26kh08QHKI3dXU/n/hpc/b/source/o/pmix/{{item}}" - dest: "{{ download_path }}/slurm_rpms" - when: ansible_os_family == 'RedHat' and download_path == '/tmp' - with_items: "{{pmix_download_packages}}" +- name: Download Slurm RPMs + block: + - name: Download on each node + block: + - name: Check if the RPMs exist in tmp + stat: + path: "{{ download_path }}/slurm_rpms/{{item}}" + register: pmix_dwld_tmp + with_items: "{{pmix_download_packages}}" + - name: First try downloading RPMs in tmp + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/lQBJduG5m6xQWauRMPkouaOoliEHbbDIqgfQXou050XAcDav2UveFlRjvFv-0JIn/n/hpc_limited_availability/b/source/o/pmix/{{ item.item }}" + dest: "{{ download_path }}/slurm_rpms" + when: ( not item.stat.exists ) + with_items: "{{pmix_dwld_tmp.results}}" + when: download_path == '/tmp' -- name: Download slurm .rpm - get_url: - url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/tgnPgvPv68JpWqLklTNY86rBsJ0z7Ebp3zs7Ud4X2_R8TZFgpm26kh08QHKI3dXU/n/hpc/b/source/o/pmix/{{item}}" - dest: "{{ download_path }}/slurm_rpms" - when: ansible_os_family == 'RedHat' and download_path != '/tmp' - delegate_to: 127.0.0.1 - run_once: true - with_items: "{{ pmix_download_packages }}" + - name: Download in shared location + block: + - name: Check if the RPMs exist + stat: + path: "{{ download_path }}/slurm_rpms/{{item}}" + register: pmix_dwld + delegate_to: 127.0.0.1 + run_once: true + with_items: "{{pmix_download_packages}}" + - name: First try downloading RPMs + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/lQBJduG5m6xQWauRMPkouaOoliEHbbDIqgfQXou050XAcDav2UveFlRjvFv-0JIn/n/hpc_limited_availability/b/source/o/pmix/{{ item.item }}" + dest: "{{ download_path }}/slurm_rpms" + when: ( not item.stat.exists ) + delegate_to: 127.0.0.1 + run_once: true + with_items: "{{pmix_dwld.results}}" + when: download_path != 
'/tmp' + + rescue: + - name: Second try downloading RPMs + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/tgnPgvPv68JpWqLklTNY86rBsJ0z7Ebp3zs7Ud4X2_R8TZFgpm26kh08QHKI3dXU/n/hpc/b/source/o/pmix/{{item}}" + dest: "{{ download_path }}/slurm_rpms" + when: download_path != '/tmp' + delegate_to: 127.0.0.1 + run_once: true + with_items: "{{pmix_download_packages}}" + - name: Second try downloading RPMs in tmp + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/tgnPgvPv68JpWqLklTNY86rBsJ0z7Ebp3zs7Ud4X2_R8TZFgpm26kh08QHKI3dXU/n/hpc/b/source/o/pmix/{{item}}" + dest: "{{ download_path }}/slurm_rpms" + when: download_path == '/tmp' + with_items: "{{pmix_download_packages}}" + when: ansible_os_family == 'RedHat' - name: install PMIx packages RedHat vars: diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index 0621555d..4bbf6114 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -11,7 +11,7 @@ - name: Render systemd units for slurm, slurmdbd and munge become: true template: - src: 'systemd/{{ item }}.service' + src: 'systemd/{{ item }}.service.j2' dest: '/lib/systemd/system/{{ item }}.service' backup: "yes" with_items: @@ -77,6 +77,16 @@ state: restarted enabled: true +- name: Add to the gres.conf file on the controller. 
+ become: true + blockinfile: + block: "{{ lookup('template', 'gres.conf.j2') }}" + path: "{{ slurm_conf_path }}/gres.conf" + marker: "### {mark} {{ansible_hostname}}" + throttle: 1 + delegate_to: 127.0.0.1 + when: "'GPU' in shape" + - name: Run the script to get the RackID shell: 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v1/host | jq .rackId' # shell: echo $RANDOM | md5sum | head -c 20 @@ -85,22 +95,37 @@ delay: 5 until: rackID_script is not failed -- name: Set RackID fact +- name: H100 Block + block: + - name: Run the script to get the RDMA Block ID + shell: 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v1/host/rdmaTopologyData/customerLocalBlock | grep ocid' + # shell: echo $RANDOM | md5sum | head -c 20 + register: blockID_script + retries: 3 + delay: 1 + ignore_errors: true + until: blockID_script is not failed + + - name: Set BlockID fact + set_fact: + rackID: "{{ blockID_script.stdout.split('.')[4][-16:-1]}}" + when: blockID_script is not failed + + - name: Set RackID fact + set_fact: + rackID: "{{ rackID_script.stdout[1:-41]}}" + when: blockID_script is failed + when: shape == 'BM.GPU.H100.8' + +- name: Set Rack ID fact on nodes other than H100 set_fact: rackID: "{{ rackID_script.stdout[1:-41]}}" - -- name: Get nodes from Inactive Switch - vars: - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" - shell: "cat {{ slurm_conf_path }}/topology.conf | grep \"SwitchName=inactive-{{queue}}-{{keyword}}\"" - register: inactive_switch - run_once: true - delegate_to: 127.0.0.1 + when: shape != 'BM.GPU.H100.8' - name: Get rackIDs for all compute nodes set_fact: racks_to_add_temp: "{{cluster_name}}:{{hostvars[item]['rackID']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" + with_items: "{{ 
play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])| difference(groups['monitoring'])}}" run_once: true register: racks_to_add_temp_results @@ -111,7 +136,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])}}" run_once: true register: nodes_to_add_temp_results @@ -138,7 +163,7 @@ - name: Get hostlist if switch exists vars: - new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" + new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" command: "scontrol show hostlistsorted {{ item.stdout_lines | union (new_line[:-1].split(',') | list )| join(',') }}" register: rack_hostlist1 delegate_to: 127.0.0.1 @@ -148,7 +173,7 @@ - name: Get hostlist if switch does not exists vars: - new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" + new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | 
difference(groups['monitoring'])) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" command: "scontrol show hostlistsorted {{ new_line[:-1] }}" register: rack_hostlist2 delegate_to: 127.0.0.1 @@ -184,32 +209,6 @@ delegate_to: 127.0.0.1 notify: reconfigure slurm -- name: Get inactive_hostlist - vars: - - inactive_list_condensed: "{{inactive_switch.stdout.split('Nodes=')[1]}}" - command: "scontrol show hostname {{inactive_list_condensed }}" - register: inactive_hostlist - delegate_to: 127.0.0.1 - -- name: Create new inactive_hostlist - command: "scontrol show hostlistsorted {{inactive_hostlist.stdout_lines | difference(nodes_to_add) | join(',') }}" - register: inactive_hostlist_condensed - delegate_to: 127.0.0.1 - -- name: remove nodes from inactive - become: true - vars: - - inactive_list: "{{inactive_switch.stdout.split('Nodes=')[1].split(',')}}" - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" - lineinfile: - path: "{{ slurm_conf_path }}/topology.conf" - regexp: "SwitchName=inactive-{{queue}}-{{keyword}}\\sNodes.*" - line: "SwitchName=inactive-{{queue}}-{{keyword}} Nodes={{inactive_hostlist_condensed.stdout}}" - state: present - run_once: true - delegate_to: 127.0.0.1 - notify: reconfigure slurm - - name: Get racks in the Cluster block: - name: Get Racks from topology.conf @@ -234,6 +233,7 @@ state: present run_once: true delegate_to: 127.0.0.1 + notify: reconfigure slurm when: racks_left_list | length > 0 - name: Run Pam settings @@ -250,41 +250,12 @@ state: restarted enabled: true +- name: Give some time to the slurmd to start + pause: + seconds: 10 -- name: Update node state on controller - block: - - name: Grab Node State - shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' - register: node_state - delegate_to: 127.0.0.1 - - set_fact: 
- node_state2: "{{ node_state.stdout }}" - - name: Update node state on controller - become: true - command: scontrol update nodename={{ ansible_hostname }} state=RESUME - when: node_state2 != "idle" and node_state2 != "alloc" - rescue: - - name: Sleep 5 seconds - pause: - seconds: 10 - - - name: Grab Node State - shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' - register: node_state - delegate_to: 127.0.0.1 - until: node_state.stdout.find("failure") == -1 - retries: 10 - delay: 5 - - - set_fact: - node_state2: "{{ node_state.stdout }}" - - - name: Update node state on controller - become: true - command: scontrol update nodename={{ ansible_hostname }} state=RESUME - when: node_state2 != "idle" and node_state2 != "alloc" - register: result - retries: 10 - delay: 5 - until: result is not failed - delegate_to: 127.0.0.1 \ No newline at end of file +- name: Reconfigure Slurm for topology + become: true + command: "scontrol reconfigure" + delegate_to: 127.0.0.1 + run_once: true \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/compute.yml b/playbooks/roles/slurm/tasks/compute.yml index 8f22bfa8..83d166ed 100755 --- a/playbooks/roles/slurm/tasks/compute.yml +++ b/playbooks/roles/slurm/tasks/compute.yml @@ -14,7 +14,7 @@ - name: Render systemd units for slurmd become: true template: - src: 'systemd/{{ item }}.service' + src: 'systemd/{{ item }}.service.j2' dest: '/lib/systemd/system/{{ item }}.service' backup: "yes" with_items: @@ -80,10 +80,20 @@ state: restarted enabled: true +- name: Add to the gres.conf file on the controller. 
+ become: true + blockinfile: + block: "{{ lookup('template', 'gres.conf.j2') }}" + path: "{{ slurm_conf_path }}/gres.conf" + marker: "### {mark} {{ansible_hostname}}" + throttle: 1 + delegate_to: 127.0.0.1 + when: "'GPU' in shape" + - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring']) }}" run_once: true register: nodes_to_add_temp_results @@ -113,14 +123,6 @@ register: cluster_hostlist_condensed_results delegate_to: 127.0.0.1 -- name: Get nodes from Inactive Switch - vars: - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" - shell: "cat {{ slurm_conf_path }}/topology.conf | grep \"SwitchName=inactive-{{queue}}-{{keyword}} \"" - register: inactive_switch - run_once: true - delegate_to: 127.0.0.1 - - name: add nodes to Switch become: true @@ -133,31 +135,6 @@ delegate_to: 127.0.0.1 notify: reconfigure slurm -- name: Get inactive_hostlist - vars: - - inactive_list_condensed: "{{inactive_switch.stdout.split('Nodes=')[1]}}" - command: "scontrol show hostname {{inactive_list_condensed }}" - register: inactive_hostlist - delegate_to: 127.0.0.1 - -- name: Create new inactive_hostlist - command: "scontrol show hostlistsorted {{inactive_hostlist.stdout_lines | difference(nodes_to_add) | join(',') }}" - register: inactive_hostlist_condensed - delegate_to: 127.0.0.1 - -- name: remove nodes from inactive - become: true - vars: - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor 
%}{% endfor %}" - lineinfile: - path: "{{ slurm_conf_path }}/topology.conf" - regexp: "SwitchName=inactive-{{queue}}-{{keyword}}\\sNodes.*" - line: "SwitchName=inactive-{{queue}}-{{keyword}} Nodes={{inactive_hostlist_condensed.stdout }}" - state: present - run_once: true - delegate_to: 127.0.0.1 - notify: reconfigure slurm - - name: Run Pam settings include_tasks: compute_pam.yml when: pam|bool @@ -168,41 +145,9 @@ name: slurmd state: restarted enabled: true - -- name: Update node state on controller - block: - - name: Grab Node State - shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' - register: node_state - delegate_to: 127.0.0.1 - - set_fact: - node_state2: "{{ node_state.stdout }}" - - name: Update node state on controller - become: true - command: scontrol update nodename={{ ansible_hostname }} state=RESUME - when: node_state2 != "idle" and node_state2 != "alloc" - rescue: - - name: Sleep 5 seconds - pause: - seconds: 10 - - - name: Grab Node State - shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' - register: node_state - delegate_to: 127.0.0.1 - until: node_state.stdout.find("failure") == -1 - retries: 10 - delay: 5 - - - set_fact: - node_state2: "{{ node_state.stdout }}" - - - name: Update node state on controller - become: true - command: scontrol update nodename={{ ansible_hostname }} state=RESUME - when: node_state2 != "idle" and node_state2 != "alloc" - register: result - retries: 10 - delay: 5 - until: result is not failed - delegate_to: 127.0.0.1 \ No newline at end of file + +- name: Reconfigure Slurm for topology + become: true + command: "scontrol reconfigure" + delegate_to: 127.0.0.1 + run_once: true \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml index 1bc888a0..0902f797 100755 --- a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml @@ -6,88 +6,63 @@ ignore_unreachable: True delegate_to: 
127.0.0.1 -- name: Get nodes from Inactive Switch - block: - - name: Get nodes from Inactive Switch - vars: - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" - shell: "cat {{ slurm_conf_path }}/topology.conf | grep \"SwitchName=inactive-{{queue}}-{{keyword}}\"" - register: inactive_switch_condensed - run_once: true - delegate_to: 127.0.0.1 - - name: Get inactive list - command: "scontrol show hostname {{inactive_switch_condensed.stdout.split('Nodes=')[1] }}" - register: inactive_switch - run_once: true - delegate_to: 127.0.0.1 - - name: Create inactive list - set_fact: - inactive_list: "{{inactive_switch.stdout_lines}}" - rescue: - - name: Create inactive cluster list - set_fact: - inactive_list: [] - run_once: true - -# - name: Get nodes from Cluster Switch -# block: -# - name: Get nodes from topology.conf -# shell: "cat /etc/slurm/topology.conf | grep \"SwitchName={{cluster_name}}\" | grep Nodes | awk '{ print $2}' | cut -c 7- | tr '\n' ',' | sed 's/,$/\\n/'" -# register: cluster_switch -# run_once: true -# delegate_to: 127.0.0.1 -# - name: Create existing cluster list -# set_fact: -# cluster_list: "{{cluster_switch.stdout.split(',')}}" -# run_once: true -# rescue: -# - name: Create existing cluster list -# set_fact: -# cluster_list: [] -# run_once: true +- name: Remove to the gres.conf file on the controller. 
+ become: true + blockinfile: + path: "{{ slurm_conf_path }}/gres.conf" + marker: "### {mark} {{ansible_hostname}}" + delegate_to: 127.0.0.1 - name: Get hostnames set_fact: nodes_to_remove_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])}}" run_once: true register: nodes_to_remove_temp_results - name: Make a list set_fact: nodes_to_remove="{{nodes_to_remove_temp_results.results | map(attribute='ansible_facts.nodes_to_remove_temp') | list}}" run_once: true - -- name: Get new inactive_nodes list - command: "scontrol show hostlistsorted {{inactive_list | union(nodes_to_remove) | join(',')}}" - register: new_inactive_list - run_once: true - delegate_to: 127.0.0.1 - -- name: Adding nodes to inactive - vars: - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" - become: true - lineinfile: - path: "{{ slurm_conf_path }}/topology.conf" - regexp: "SwitchName=inactive-{{queue}}-{{keyword}}\\sNodes.*" - line: "SwitchName=inactive-{{queue}}-{{keyword}} Nodes={{new_inactive_list.stdout }}" - state: present - run_once: true - delegate_to: 127.0.0.1 - name: Run the script to get the RackID shell: 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v1/host | jq .rackId' # shell: echo $RANDOM | md5sum | head -c 20 register: rackID_script + retries: 5 + delay: 5 + until: rackID_script is not failed + +- name: H100 Block + block: + - name: Run the script to get the RDMA Block ID + shell: 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v1/host/rdmaTopologyData/customerLocalBlock | grep ocid' + # shell: echo 
$RANDOM | md5sum | head -c 20 + register: blockID_script + retries: 3 + delay: 1 + ignore_errors: true + until: blockID_script is not failed + + - name: Set BlockID fact + set_fact: + rackID: "{{ blockID_script.stdout.split('.')[4][-16:-1]}}" + when: blockID_script is not failed -- name: Get RackID - set_fact: + - name: Set RackID fact + set_fact: + rackID: "{{ rackID_script.stdout[1:-41]}}" + when: blockID_script is failed + when: shape == 'BM.GPU.H100.8' + +- name: Set Rack ID fact on nodes other than H100 + set_fact: rackID: "{{ rackID_script.stdout[1:-41]}}" + when: shape != 'BM.GPU.H100.8' - name: Get rackIDs set_fact: racks_to_remove_temp: "{{cluster_name}}:{{hostvars[item]['rackID']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])}}" run_once: true register: racks_to_remove_temp_results diff --git a/playbooks/roles/slurm/tasks/destroy.yml b/playbooks/roles/slurm/tasks/destroy.yml index 7df264a6..eaa624a7 100755 --- a/playbooks/roles/slurm/tasks/destroy.yml +++ b/playbooks/roles/slurm/tasks/destroy.yml @@ -6,6 +6,13 @@ ignore_unreachable: True delegate_to: 127.0.0.1 +- name: Remove to the gres.conf file on the controller. 
+ become: true + blockinfile: + path: "{{ slurm_conf_path }}/gres.conf" + marker: "### {mark} {{ansible_hostname}}" + delegate_to: 127.0.0.1 + - name: Get nodes from Cluster Switch block: - name: Get nodes from topology.conf @@ -26,36 +33,10 @@ set_fact: cluster_list: [] -- name: Get nodes from Inactive Switch - block: - - name: Get nodes from Inactive Switch - vars: - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" - shell: "cat {{ slurm_conf_path }}/topology.conf | grep \"SwitchName=inactive-{{queue}}-{{keyword}}\"" - register: inactive_switch_condensed - run_once: true - delegate_to: 127.0.0.1 - - name: Get inactive list - command: "scontrol show hostname {{inactive_switch_condensed.stdout.split('Nodes=')[1] }}" - register: inactive_switch - run_once: true - delegate_to: 127.0.0.1 - - name: Create inactive list - set_fact: - inactive_list: "{{inactive_switch.stdout_lines}}" - run_once: true - delegate_to: 127.0.0.1 - rescue: - - name: Create inactive cluster list - set_fact: - inactive_list: [] - run_once: true - delegate_to: 127.0.0.1 - - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring']) }}" run_once: true register: nodes_to_add_temp_results @@ -91,27 +72,9 @@ delegate_to: 127.0.0.1 when: cluster_list | difference(nodes_to_add) | join(',') == '' -- name: Get new inactive_nodes list - command: "scontrol show hostlistsorted {{inactive_list | union(nodes_to_add) | join(',')}}" - register: new_inactive_list - run_once: true - delegate_to: 127.0.0.1 - -- name: Adding nodes to inactive - 
vars: - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" - become: true - lineinfile: - path: "{{ slurm_conf_path }}/topology.conf" - regexp: "SwitchName=inactive-{{queue}}-{{keyword}}\\sNodes.*" - line: "SwitchName=inactive-{{queue}}-{{keyword}} Nodes={{new_inactive_list.stdout}}" - state: present - run_once: true - delegate_to: 127.0.0.1 - - name: change Node Status become: true - command: "scontrol update nodename={{ ansible_hostname }} state=future reason=terminating" + command: "scontrol delete nodename={{ ansible_hostname }}" ignore_errors: true ignore_unreachable: True delegate_to: 127.0.0.1 diff --git a/playbooks/roles/slurm/tasks/el7.yml b/playbooks/roles/slurm/tasks/el7.yml index d9c15214..8afebd88 100755 --- a/playbooks/roles/slurm/tasks/el7.yml +++ b/playbooks/roles/slurm/tasks/el7.yml @@ -5,11 +5,11 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) -- name: run login server directives +- name: run login/monitoring server directives vars: slurm_repos: "epel,ol7_developer_EPEL" include_tasks: login.yml - when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + when: (('login' in group_names) or ('monitoring' in group_names) )and (not destroy|bool) and (initial| bool) - name: run backup server directives vars: diff --git a/playbooks/roles/slurm/tasks/el8.yml b/playbooks/roles/slurm/tasks/el8.yml index 1f5a2482..693df83e 100755 --- a/playbooks/roles/slurm/tasks/el8.yml +++ b/playbooks/roles/slurm/tasks/el8.yml @@ -5,11 +5,11 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) -- name: run login server directives +- name: run login/monitoring server directives vars: slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" include_tasks: login.yml - when: ('login' in 
group_names) and (not destroy|bool) and (initial| bool) + when: (('login' in group_names) or ('monitoring' in group_names) ) and (not destroy|bool) and (initial| bool) - name: run backup server directives vars: diff --git a/playbooks/roles/slurm/tasks/login.yml b/playbooks/roles/slurm/tasks/login.yml index d68da67f..95e2732b 100755 --- a/playbooks/roles/slurm/tasks/login.yml +++ b/playbooks/roles/slurm/tasks/login.yml @@ -7,26 +7,26 @@ include_role: name: safe_yum -- name: Render systemd units for slurmd +- name: Render systemd units for sackd become: true template: - src: 'systemd/{{ item }}.service' + src: 'systemd/{{ item }}.service.j2' dest: '/lib/systemd/system/{{ item }}.service' backup: "yes" with_items: - - slurmd + - sackd when: ansible_os_family == 'Debian' -- name: Create systemd unit dirs for slurmd and munge +- name: Create systemd unit dirs for sackd and munge become: true file: name: '/etc/systemd/system/{{ item }}.service.d' state: directory with_items: - munge - - slurmd + - sackd -- name: Render systemd units for slurmd and munge +- name: Render systemd units for sackd and munge become: true template: src: 'systemd/{{ item }}.service.d/unit.conf.j2' @@ -34,7 +34,7 @@ backup: "yes" with_items: - munge - - slurmd + - sackd - name: Create munge dir become: true @@ -76,9 +76,9 @@ state: restarted enabled: true -- name: start slurmd +- name: start sackd become: true service: - name: slurmd + name: sackd state: restarted enabled: true \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/server.yml b/playbooks/roles/slurm/tasks/server.yml index 9610b527..28bd1d99 100755 --- a/playbooks/roles/slurm/tasks/server.yml +++ b/playbooks/roles/slurm/tasks/server.yml @@ -30,6 +30,14 @@ name: safe_yum when: ansible_os_family == 'Debian' +- name: install SLURM server packages Ubuntu + vars: + package_name: '{{ slurm_server_packages }}' + package_state: present + include_role: + name: safe_yum + when: ansible_os_family == 'Debian' + - name: install 
SLURM server packages RedHat vars: package_name: '{{ slurm_server_packages }}' @@ -99,7 +107,7 @@ - name: copy munge.key become: true shell: - cmd: cp /etc/munge/munge.key /tmp/munge.key + cmd: "cp {{ munge_conf_path }}/munge.key /tmp/munge.key" - name: set permissions become: true shell: @@ -139,7 +147,16 @@ owner: slurm group: slurm backup: yes - + +- name: Create empty GRES.conf + become: true + lineinfile: + path: "{{ slurm_conf_path }}/gres.conf" + regexp: "#GRES.conf" + line: "#GRES.conf" + state: present + create: true + - name: Generate slurm.conf become: true template: @@ -170,14 +187,6 @@ line: alias validate="python3 /opt/oci-hpc/scripts/validation.py" state: present -- name: Generate gres.conf - become: true - template: - src: gres.conf.j2 - dest: '{{ slurm_conf_path }}/gres.conf' - mode: '0644' - backup: yes - - name: Generate topology.conf become: true template: @@ -192,41 +201,5 @@ run_once: true notify: reconfigure slurm - - name: run handlers - meta: flush_handlers - -- name: Generate a list of types to check - vars: - temp_list: "{% for partition in queues %}{% for instancetype in partition.instance_types %}{{partition.name}}-{{instancetype.instance_keyword}},{% endfor %}{% endfor %}" - set_fact: - nodesname_list: "{{temp_list.split(',')[:-1] }}" - when: not initial_topology.changed - -- name: Check if shapes need to be added - become: true - lineinfile: - path: "{{ slurm_conf_path }}/topology.conf" - regexp: "SwitchName=inactive-{{item}}\\sNodes.*" - state: absent - check_mode: yes - with_items: "{{nodesname_list}}" - run_once: true - delegate_to: 127.0.0.1 - register: shape_added - when: not initial_topology.changed - -- name: Add new shapes to existing topology.conf - become: true - vars: - size: "{{ hostvars[inventory_hostname]['private_subnet'] | ipaddr('size') }}" - lineinfile: - path: "{{ slurm_conf_path }}/topology.conf" - regexp: "SwitchName=inactive-{{item.item}}\\sNodes.*" - line: "SwitchName=inactive-{{item.item}} 
Nodes={{item.item}}-node-[1-{{size}}]" - state: present - with_items: "{{shape_added.results}}" - run_once: true - delegate_to: 127.0.0.1 - when: not initial_topology.changed and not ( item.changed | bool) - notify: reconfigure slurm \ No newline at end of file + meta: flush_handlers \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/ubuntu.yml b/playbooks/roles/slurm/tasks/ubuntu.yml index 5399d7cc..e3533b75 100644 --- a/playbooks/roles/slurm/tasks/ubuntu.yml +++ b/playbooks/roles/slurm/tasks/ubuntu.yml @@ -10,9 +10,9 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) -- name: run login server directives +- name: run login/monitoring server directives include_tasks: login.yml - when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + when: (('login' in group_names) or ('monitoring' in group_names) ) and (not destroy|bool) and (initial| bool) - name: run backup server directives include_tasks: backup_server.yml diff --git a/playbooks/roles/slurm/templates/gres.conf.j2 b/playbooks/roles/slurm/templates/gres.conf.j2 index 5010eb10..0694d498 100644 --- a/playbooks/roles/slurm/templates/gres.conf.j2 +++ b/playbooks/roles/slurm/templates/gres.conf.j2 @@ -1,47 +1,65 @@ -{% set size = hostvars[inventory_hostname]['private_subnet'] | ipaddr('size')%} -{% for partition in queues %} -{% for instance in partition.instance_types %} -{% if instance.shape == "BM.GPU2.2"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-13] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia1 Type=P100 Cores=[14-27] -{% elif instance.shape == "VM.GPU2.1"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-11] -{% elif instance.shape == "VM.GPU3.1"%} 
-NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=V100 Cores=[0-5] -{% elif instance.shape == "VM.GPU3.2"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=V100 Cores=[0-11] -{% elif instance.shape == "VM.GPU3.4"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-23] -{% elif instance.shape == "BM.GPU3.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-25] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-7] Type=V100 Cores=[26-51] -{% elif instance.shape == "BM.GPU4.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[24-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[8-15] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[56-63] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[40-47] -{% elif instance.shape == "BM.GPU.B4.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] -{% elif instance.shape == "BM.GPU.A100-v2.8"%} 
-NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] -{% elif instance.shape == "BM.GPU.H100.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=H100 Cores=[48-55] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-7] Type=H100 Cores=[56-111] -{% elif instance.shape == "BM.GPU.T1.2" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] -{% elif instance.shape == "BM.GPU.A10.4" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A10 Cores=[32-63] -{% elif instance.shape == "VM.GPU.A10.2" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-29] -{% elif instance.shape == "VM.GPU.A10.1" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=A10 Cores=[0-14] +{% if shape== "BM.GPU2.2"%} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-13] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia1 Type=P100 Cores=[14-27] +{% elif shape == "VM.GPU2.1"%} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-11] +{% elif shape == 
"VM.GPU3.1"%} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia0 Type=V100 Cores=[0-5] +{% elif shape == "VM.GPU3.2"%} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-1] Type=V100 Cores=[0-11] +{% elif shape == "VM.GPU3.4"%} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-23] +{% elif shape == "BM.GPU3.8"%} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-25] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[4-7] Type=V100 Cores=[26-51] +{% elif shape == "BM.GPU4.8"%} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[24-31] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[8-15] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[56-63] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[40-47] +{% elif shape == "BM.GPU.B4.8"%} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] +{% elif shape == "BM.GPU.A100-v2.8"%} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] +{% elif shape == "BM.GPU.H100.8"%} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-3] Type=H100 Cores=[0-55] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[4-7] Type=H100 Cores=[56-111] +{% elif shape == "BM.GPU.MI300X.8"%} +NodeName={{ansible_hostname}} Name=gpu 
File=/dev/dri/renderD[128,136,144,152,160,168,176,184] Type=MI300X +{% elif shape == "BM.GPU.L40S.4"%} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-1] Type=L40S Cores=[0-55] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[2-3] Type=L40S Cores=[56-111] +{% elif shape == "BM.GPU.T1.2" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] +{% elif shape == "BM.GPU.A10.4" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[2-3] Type=A10 Cores=[32-63] +{% elif shape == "VM.GPU.A10.2" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-29] +{% elif shape == "VM.GPU.A10.1" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia0 Type=A10 Cores=[0-14] +{% elif shape == "VM.GPU.A100.40G.1" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia0 Type=A100 Cores=[0-6] +{% elif shape == "VM.GPU.A100.40G.2" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[0-13] +{% elif shape == "VM.GPU.A100.40G.4" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-3] Type=A100 Cores=[0-27] +{% elif shape == "VM.GPU.A100.B40G.1" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia0 Type=A100 Cores=[0-14] +{% elif shape == "VM.GPU.A100.B40G.2" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[0-29] +{% elif shape == "VM.GPU.A100.B40G.4" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-3] Type=A100 Cores=[0-59] +{% elif shape == "VM.GPU.A100.80G.1" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia0 Type=A100 Cores=[0-14] +{% elif shape == "VM.GPU.A100.80G.2" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[0-29] +{% elif shape == "VM.GPU.A100.80G.4" %} +NodeName={{ansible_hostname}} Name=gpu File=/dev/nvidia[0-3] Type=A100 Cores=[0-59] {% endif %} -{% endfor %} -{% endfor %} 
\ No newline at end of file diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index 0ea9259f..be583b7c 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -13,7 +13,8 @@ SlurmdPort=6818 SlurmdSpoolDir=/var/spool/slurmd SlurmUser=slurm {% if pyxis|bool or healthchecks|bool%} -Prolog=/etc/slurm/prolog.d/* +Prolog={{slurm_conf_path}}/prolog.d/* +SchedulerParameters=nohold_on_prolog_fail {% endif %} SlurmctldLogFile=/var/log/slurm/slurmctld.log SlurmdLogFile=/var/log/slurm/slurmd.log @@ -41,15 +42,16 @@ SlurmctldDebug=info SlurmdDebug=info EnforcePartLimits=NO PropagateResourceLimitsExcept=MEMLOCK -CommunicationParameters=NoAddrCache +CommunicationParameters=block_null_hash TopologyPlugin=topology/tree TopologyParam=SwitchAsNodeRank TreeWidth=2048 SlurmctldParameters=enable_configless +MaxNodeCount=10000 {% if healthchecks|bool %} -HealthCheckProgram=/etc/slurm/prolog.d/healthchecks.sh +HealthCheckProgram={{slurm_conf_path}}/prolog.d/healthchecks.sh HealthCheckInterval=300 -HealthCheckNodeState=NONDRAINED_IDLE,CYCLE +HealthCheckNodeState=IDLE,CYCLE {% endif %} {% if sacct_limits|bool %} AccountingStorageTRES=gres/gpu @@ -58,104 +60,9 @@ JobCompType=jobcomp/none TrackWckey=no {% endif %} - -{% if (groups['login']| length ) > 0 %} -NodeName={{ hostvars[groups['login'][0]]['ansible_fqdn'].split('.')[0] }} -{% endif %} - {% for partition in queues %} {% for instance in partition.instance_types %} -{% set size = instance.private_subnet | ipaddr('size')%} -{% if instance.hyperthreading | bool %} -{% set threadspercore = 2 %} -{% else %} -{% set threadspercore = 1 %} -{% endif %} -{% if instance.shape == "BM.GPU2.2"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=14 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% 
endif %}{{ instance.name }} Gres=gpu:P100:2 -{% elif instance.shape == "VM.GPU2.1" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=12 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:P100:1 -{% elif instance.shape == "VM.GPU3.1" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=6 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:V100:1 -{% elif instance.shape == "VM.GPU3.2" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=12 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:V100:2 -{% elif instance.shape == "VM.GPU3.4" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=24 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:V100:4 -{% elif instance.shape == "BM.GPU3.8" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=26 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:V100:8 -{% elif instance.shape == "BM.GPU4.8" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} 
Gres=gpu:A100:8 -{% elif instance.shape == "BM.GPU.B4.8" and threadspercore == 1 %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=8 CoresPerSocket=16 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A100:8 -{% elif instance.shape == "BM.GPU.B4.8" and threadspercore == 2 %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A100:8 -{% elif instance.shape == "BM.GPU.A100-v2.8" and threadspercore == 1 %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=8 CoresPerSocket=16 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A100:8 -{% elif instance.shape == "BM.GPU.A100-v2.8" and threadspercore == 2 %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A100:8 -{% elif instance.shape == "BM.GPU.H100.8" and threadspercore == 1 %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=56 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:H100:8 -{% elif instance.shape == "BM.GPU.H100.8" and threadspercore == 2 %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=56 ThreadsPerCore=2 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape 
}},{% endif %}{{ instance.name }} Gres=gpu:H100:8 -{% elif instance.shape == "BM.GPU.T1.2" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:2 -{% elif instance.shape == "BM.GPU.A10.4" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:4 -{% elif instance.shape == "VM.GPU.A10.2" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=30 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:2 -{% elif instance.shape == "VM.GPU.A10.1" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=15 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:1 -{% elif instance.shape == "VM.Standard.E3.Flex" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "VM.Standard.E4.Flex" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != 
instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "VM.Standard.E5.Flex" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "VM.Optimized3.Flex" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "VM.Standard3.Flex" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "VM.DenseIO.E4.Flex" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "VM.DenseIO.E5.Flex" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "VM.Standard.A1.Flex" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 
CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Standard.E3.128" and threadspercore == 1%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Standard.E3.128" and threadspercore == 2%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Standard.E4.128" and threadspercore == 1 %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Standard.E4.128" and threadspercore == 2 %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.DenseIO.E4.128" and threadspercore == 1 %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.DenseIO.E4.128" and threadspercore == 2 %} 
-NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.DenseIO.E5.128" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.HPC2.36" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.HPC.E5.144" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=72 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Standard.E5.192" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=96 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Optimized3.36" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif "VM.Standard2." 
in instance.shape %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{ instance.shape.split('.')[-1]|int }} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Standard2.52" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=26 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Standard3.64" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif "VM.Standard.E2." in instance.shape %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{ instance.shape.split('.')[-1]|int }} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Standard.E2.64" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Standard.A1.160" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=80 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% endif %} +Nodeset={{instance.name}} 
Feature={{instance.name}} {% endfor %} {% endfor %} @@ -163,16 +70,14 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar {% if partition.default %} {% set nodesList = [] %} {% for instance in partition.instance_types %} -{% set size = instance.private_subnet | ipaddr('size')%} -{{ nodesList.append(partition.name+'-'+instance.instance_keyword+'-node-[1-'+size|string+']')}} +{{ nodesList.append(instance.name)}} {%- endfor %} -PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=YES MaxTime=INFINITE State=UP +PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=YES {% else %} {% set nodesList = [] %} {% for instance in partition.instance_types %} -{% set size = instance.private_subnet | ipaddr('size')%} -{{ nodesList.append(partition.name+'-'+instance.instance_keyword+'-node-[1-'+size|string+']')}} +{{ nodesList.append(instance.name)}} {%- endfor %} -PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=NO MaxTime=INFINITE State=UP +PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=NO {% endif %} {% endfor %} \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/systemd/sackd.service.d/unit.conf.j2 b/playbooks/roles/slurm/templates/systemd/sackd.service.d/unit.conf.j2 new file mode 100755 index 00000000..61348b4c --- /dev/null +++ b/playbooks/roles/slurm/templates/systemd/sackd.service.d/unit.conf.j2 @@ -0,0 +1,7 @@ +[Unit] +Requires=munge.service + +[Service] +Restart=always +ExecStart= +ExecStart={{slurm_exec}}/sbin/sackd --systemd --conf-server {{ hostvars[groups['controller'][0]]['ansible_fqdn'].split('.')[0] }}{% if (groups['slurm_backup']| length ) > 0 %},{{ hostvars[groups['slurm_backup'][0]]['ansible_fqdn'].split('.')[0] }}{% endif %} $SACKD_OPTIONS \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/systemd/sackd.service.j2 b/playbooks/roles/slurm/templates/systemd/sackd.service.j2 new file mode 100644 index 
00000000..4c05fa18 --- /dev/null +++ b/playbooks/roles/slurm/templates/systemd/sackd.service.j2 @@ -0,0 +1,28 @@ +[Unit] +Description=Slurm auth and cred kiosk daemon +After=network-online.target remote-fs.target +Wants=network-online.target + +[Service] +Type=notify +EnvironmentFile=-/etc/sysconfig/sackd +EnvironmentFile=-/etc/default/sackd +User=slurm +Group=slurm +RuntimeDirectory=slurm +RuntimeDirectoryMode=0755 +ExecStart={{slurm_exec}}/sbin/sackd --systemd --conf-server {{ hostvars[groups['controller'][0]]['ansible_fqdn'].split('.')[0] }}{% if (groups['slurm_backup']| length ) > 0 %},{{ hostvars[groups['slurm_backup'][0]]['ansible_fqdn'].split('.')[0] }}{% endif %} $SACKD_OPTIONS +ExecReload=/bin/kill -HUP $MAINPID +KillMode=process +LimitNOFILE=131072 +LimitMEMLOCK=infinity +LimitSTACK=infinity +TasksMax=infinity + +# Uncomment the following lines to disable logging through journald. +# NOTE: It may be preferable to set these through an override file instead. +#StandardOutput=null +#StandardError=null + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/systemd/slurmctld.service.j2 b/playbooks/roles/slurm/templates/systemd/slurmctld.service.j2 index c9fa73d4..9de6957f 100644 --- a/playbooks/roles/slurm/templates/systemd/slurmctld.service.j2 +++ b/playbooks/roles/slurm/templates/systemd/slurmctld.service.j2 @@ -1,7 +1,7 @@ [Unit] Description=Slurm controller daemon After=network.target munge.service -ConditionPathExists=/etc/slurm/slurm.conf +ConditionPathExists={{slurm_conf_path}}/slurm.conf Documentation=man:slurmctld(8) [Service] diff --git a/playbooks/roles/slurm/templates/systemd/slurmd.service b/playbooks/roles/slurm/templates/systemd/slurmd.service deleted file mode 100644 index 534afe2c..00000000 --- a/playbooks/roles/slurm/templates/systemd/slurmd.service +++ /dev/null @@ -1,20 +0,0 @@ -[Unit] -Description=Slurm node daemon -After=munge.service network.target remote-fs.target 
-Documentation=man:slurmd(8) - -[Service] -Type=forking -EnvironmentFile=-/etc/default/slurm -ExecStart={{slurm_exec}}/sbin/slurmd --conf-server {{ hostvars[groups['controller'][0]]['ansible_fqdn'].split('.')[0] }}{% if (groups['slurm_backup']| length ) > 0 %},{{ hostvars[groups['slurm_backup'][0]]['ansible_fqdn'].split('.')[0] }}{% endif %} $SLURMD_OPTIONS -ExecReload=/bin/kill -HUP $MAINPID -PIDFile=/run/slurmd.pid -KillMode=process -LimitNOFILE=131072 -LimitMEMLOCK=infinity -LimitSTACK=infinity -Delegate=yes -TasksMax=infinity - -[Install] -WantedBy=multi-user.target \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/systemd/slurmd.service.d/unit.conf.j2 b/playbooks/roles/slurm/templates/systemd/slurmd.service.d/unit.conf.j2 index 7f4faf67..b62644bf 100755 --- a/playbooks/roles/slurm/templates/systemd/slurmd.service.d/unit.conf.j2 +++ b/playbooks/roles/slurm/templates/systemd/slurmd.service.d/unit.conf.j2 @@ -1,9 +1,49 @@ + [Unit] Requires=munge.service [Service] Restart=always -{% if ansible_os_family == 'RedHat' %} ExecStart= -ExecStart={{slurm_exec}}/sbin/slurmd --conf-server {{ hostvars[groups['controller'][0]]['ansible_fqdn'].split('.')[0] }}{% if (groups['slurm_backup']| length ) > 0 %},{{ hostvars[groups['slurm_backup'][0]]['ansible_fqdn'].split('.')[0] }}{% endif %} -D $SLURMD_OPTIONS -{% endif %} \ No newline at end of file +{% if shape == "BM.GPU2.2"%} +{% set gres = "Gres=gpu:P100:2" %} +{% elif shape == "VM.GPU2.1" %} +{% set gres = "Gres=gpu:P100:1" %} +{% elif shape == "VM.GPU3.1" %} +{% set gres = "Gres=gpu:V100:1" %} +{% elif shape == "VM.GPU3.2" %} +{% set gres = "Gres=gpu:V100:2" %} +{% elif shape == "VM.GPU3.4" %} +{% set gres = "Gres=gpu:V100:4" %} +{% elif shape == "BM.GPU3.8" %} +{% set gres = "Gres=gpu:V100:8" %} +{% elif shape == "BM.GPU4.8" %} +{% set gres = "Gres=gpu:A100:8" %} +{% elif shape == "BM.GPU.B4.8" %} +{% set gres = "Gres=gpu:A100:8" %} +{% elif shape == "BM.GPU.A100-v2.8" %} +{% set gres = 
"Gres=gpu:A100:8" %} +{% elif shape == "BM.GPU.H100.8" %} +{% set gres = "Gres=gpu:H100:8" %} +{% elif shape == "BM.GPU.T1.2" %} +{% set gres = "Gres=gpu:A10:2" %} +{% elif shape == "BM.GPU.A10.4" %} +{% set gres = "Gres=gpu:A10:4" %} +{% elif shape == "VM.GPU.A10.2" %} +{% set gres = "Gres=gpu:A10:2" %} +{% elif shape == "VM.GPU.A10.1" %} +{% set gres = "Gres=gpu:A10:1" %} +{% elif shape == "BM.GPU.MI300X.8" %} +{% set gres = "Gres=gpu:MI300X:8" %} +{% elif shape == "BM.GPU.L40S.4" %} +{% set gres = "Gres=gpu:L40S:4" %} +{% elif shape == "VM.GPU.A100.40G.1" or shape == "VM.GPU.A100.B40G.1" or shape == "VM.GPU.A100.80G.1"%} +{% set gres = "Gres=gpu:A100:1" %} +{% elif shape == "VM.GPU.A100.40G.2" or shape == "VM.GPU.A100.B40G.2" or shape == "VM.GPU.A100.80G.2"%} +{% set gres = "Gres=gpu:A100:2" %} +{% elif shape == "VM.GPU.A100.40G.4" or shape == "VM.GPU.A100.B40G.4" or shape == "VM.GPU.A100.80G.4"%} +{% set gres = "Gres=gpu:A100:4" %} +{% else %} +{% set gres = "" %} +{% endif %} +ExecStart={{slurm_exec}}/sbin/slurmd --systemd -Z --conf "{{gres}} Feature={{instance_type}},CN__{{cluster_name}}" --conf-server {{ hostvars[groups['controller'][0]]['ansible_fqdn'].split('.')[0] }}{% if (groups['slurm_backup']| length ) > 0 %},{{ hostvars[groups['slurm_backup'][0]]['ansible_fqdn'].split('.')[0] }}{% endif %} $SLURMD_OPTIONS \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/systemd/slurmd.service.j2 b/playbooks/roles/slurm/templates/systemd/slurmd.service.j2 new file mode 100644 index 00000000..269207c9 --- /dev/null +++ b/playbooks/roles/slurm/templates/systemd/slurmd.service.j2 @@ -0,0 +1,27 @@ +[Unit] +Description=Slurm node daemon +After=munge.service network-online.target remote-fs.target sssd.service +Wants=network-online.target + +[Service] +Type=notify +EnvironmentFile=-/etc/sysconfig/slurmd +EnvironmentFile=-/etc/default/slurmd +RuntimeDirectory=slurm +RuntimeDirectoryMode=0755 +ExecStart={{slurm_exec}}/sbin/slurmd --systemd --conf-server 
{{ hostvars[groups['controller'][0]]['ansible_fqdn'].split('.')[0] }}{% if (groups['slurm_backup']| length ) > 0 %},{{ hostvars[groups['slurm_backup'][0]]['ansible_fqdn'].split('.')[0] }}{% endif %} $SLURMD_OPTIONS +ExecReload=/bin/kill -HUP $MAINPID +KillMode=process +LimitNOFILE=131072 +LimitMEMLOCK=infinity +LimitSTACK=infinity +Delegate=yes +TasksMax=infinity + +# Uncomment the following lines to disable logging through journald. +# NOTE: It may be preferable to set these through an override file instead. +#StandardOutput=null +#StandardError=null + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/systemd/slurmdbd.service.j2 b/playbooks/roles/slurm/templates/systemd/slurmdbd.service.j2 index f856944a..0b026b7c 100644 --- a/playbooks/roles/slurm/templates/systemd/slurmdbd.service.j2 +++ b/playbooks/roles/slurm/templates/systemd/slurmdbd.service.j2 @@ -1,7 +1,7 @@ [Unit] Description=Slurm DBD accounting daemon After=network.target munge.service -ConditionPathExists=/etc/slurm/slurmdbd.conf +ConditionPathExists={{slurm_conf_path}}/slurmdbd.conf Documentation=man:slurmdbd(8) [Service] diff --git a/playbooks/roles/slurm/templates/topology.conf.j2 b/playbooks/roles/slurm/templates/topology.conf.j2 index 66654ab1..a8c15890 100644 --- a/playbooks/roles/slurm/templates/topology.conf.j2 +++ b/playbooks/roles/slurm/templates/topology.conf.j2 @@ -1,10 +1 @@ -### Topology File -{% if (groups['login']| length ) > 0 %} -SwitchName=login-node Nodes={{ hostvars[groups['login'][0]]['ansible_fqdn'].split('.')[0] }} -{% endif %} -{% for partition in queues %} -{% for instance in partition.instance_types %} -{% set size = instance.private_subnet | ipaddr('size')%} -SwitchName=inactive-{{partition.name}}-{{instance.instance_keyword}} Nodes={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] -{% endfor %} -{% endfor %} \ No newline at end of file +### Topology File \ No newline at end of file diff --git 
a/playbooks/roles/slurm/vars/centos_vars.yml b/playbooks/roles/slurm/vars/centos_vars.yml index ed17b849..82a55931 100644 --- a/playbooks/roles/slurm/vars/centos_vars.yml +++ b/playbooks/roles/slurm/vars/centos_vars.yml @@ -23,6 +23,8 @@ slurm_all_packages: - "slurm-centos-pam_slurm-{{slurm_version}}.el7.x86_64.rpm" - "slurm-centos-libpmi-{{slurm_version}}.el7.x86_64.rpm" - "slurm-centos-slurmd-{{slurm_version}}.el7.x86_64.rpm" + - "slurm-centos-sackd-{{slurm_version}}.el7.x86_64.rpm" + - "slurm-centos-slurmrestd-{{slurm_version}}.el7.x86_64.rpm" slurm_common_packages: - "{{ download_path }}/slurm_rpms/slurm-centos-{{slurm_version}}.el7.x86_64.rpm" @@ -36,6 +38,7 @@ slurm_common_packages: slurm_server_packages: - "{{ download_path }}/slurm_rpms/slurm-centos-slurmctld-{{slurm_version}}.el7.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-centos-slurmdbd-{{slurm_version}}.el7.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-centos-slurmrestd-{{slurm_version}}.el7.x86_64.rpm" slurm_compute_packages: - "{{ download_path }}/slurm_rpms/slurm-centos-pam_slurm-{{slurm_version}}.el7.x86_64.rpm" @@ -51,7 +54,7 @@ slurm_backup_server_packages: slurm_login_packages: - "{{ download_path }}/slurm_rpms/slurm-centos-pam_slurm-{{slurm_version}}.el7.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-centos-libpmi-{{slurm_version}}.el7.x86_64.rpm" - - "{{ download_path }}/slurm_rpms/slurm-centos-slurmd-{{slurm_version}}.el7.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-centos-sackd-{{slurm_version}}.el7.x86_64.rpm" pmix_download_packages: diff --git a/playbooks/roles/slurm/vars/el_vars.yml b/playbooks/roles/slurm/vars/el_vars.yml index d7149d6e..ab9b9fde 100644 --- a/playbooks/roles/slurm/vars/el_vars.yml +++ b/playbooks/roles/slurm/vars/el_vars.yml @@ -23,6 +23,8 @@ slurm_all_packages: - "slurm-pam_slurm-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" - "slurm-libpmi-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" - 
"slurm-slurmd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "slurm-sackd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "slurm-slurmrestd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" slurm_common_packages: - "{{ download_path }}/slurm_rpms/slurm-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" @@ -35,6 +37,7 @@ slurm_common_packages: slurm_server_packages: - "{{ download_path }}/slurm_rpms/slurm-slurmctld-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-slurmdbd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-slurmrestd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" slurm_compute_packages: - "{{ download_path }}/slurm_rpms/slurm-pam_slurm-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" @@ -50,7 +53,7 @@ slurm_backup_server_packages: slurm_login_packages: - "{{ download_path }}/slurm_rpms/slurm-pam_slurm-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-libpmi-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" - - "{{ download_path }}/slurm_rpms/slurm-slurmd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-sackd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" pmix_download_packages: diff --git a/playbooks/roles/slurm/vars/ubuntu_vars.yml b/playbooks/roles/slurm/vars/ubuntu_vars.yml index c820e9b8..46172537 100644 --- a/playbooks/roles/slurm/vars/ubuntu_vars.yml +++ b/playbooks/roles/slurm/vars/ubuntu_vars.yml @@ -14,7 +14,8 @@ munge_packages: slurm_common_packages: [] -slurm_server_packages: [] +slurm_server_packages: + - libjwt-dev slurm_compute_packages: - libpmi0 diff --git a/playbooks/roles/telegraf/files/telegraf.conf 
b/playbooks/roles/telegraf/files/telegraf.conf index 5e4074b2..bd0e15f7 100755 --- a/playbooks/roles/telegraf/files/telegraf.conf +++ b/playbooks/roles/telegraf/files/telegraf.conf @@ -25,7 +25,7 @@ # Configuration for telegraf agent [agent] ## Default data collection interval for all inputs - interval = "10s" + interval = "30s" ## Rounds collection interval to 'interval' ## ie, if interval="10s" then always collect on :00, :10, :20, etc. round_interval = true diff --git a/playbooks/roles/telegraf/files/telegraf_amd_gpu.conf b/playbooks/roles/telegraf/files/telegraf_amd_gpu.conf new file mode 100755 index 00000000..a6496895 --- /dev/null +++ b/playbooks/roles/telegraf/files/telegraf_amd_gpu.conf @@ -0,0 +1,7023 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply surround +# them with ${}. For strings the variable must be within quotes (ie, "${STR_VAR}"), +# for numbers and booleans they should be plain (ie, ${INT_VAR}, ${BOOL_VAR}) + + +# Global tags can be specified here in key="value" format. +[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + # rack = "1a" + ## Environment variables can be used as tags, and throughout the config file + # user = "$USER" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "30s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. 
+ ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## Maximum number of unwritten metrics per output. Increasing this value + ## allows for longer periods of output downtime without dropping metrics at the + ## cost of higher maximum memory usage. + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. Maximum flush_interval will be + ## flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Log at debug level. + # debug = false + ## Log only error level messages. + # quiet = false + + ## Log target controls the destination for logs and can be one of "file", + ## "stderr" or, on Windows, "eventlog". When set to "file", the output file + ## is determined by the "logfile" setting. + # logtarget = "file" + + ## Name of the file to be logged to when using the "file" logtarget. 
If set to + ## the empty string then logs are written to stderr. + # logfile = "" + + ## The logfile will be rotated after the time interval specified. When set + ## to 0 no time based rotation is performed. Logs are rotated only when + ## written to, if there is no log activity rotation may be delayed. + # logfile_rotation_interval = "0d" + + ## The logfile will be rotated when it becomes larger than the specified + ## size. When set to 0 no size based rotation is performed. + # logfile_rotation_max_size = "0MB" + + ## Maximum number of rotated archives to keep, any older logs are deleted. + ## If set to -1, no archives are removed. + # logfile_rotation_max_archives = 5 + + ## Override default hostname, if empty use os.Hostname() + hostname = "" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = false + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + + +# Configuration for sending metrics to InfluxDB +#[[outputs.influxdb]] + ## The full HTTP or UDP URL for your InfluxDB instance. + ## + ## Multiple URLs can be specified for a single cluster, only ONE of the + ## urls will be written to each interval. + # urls = ["unix:///var/run/influxdb.sock"] + # urls = ["udp://127.0.0.1:8089"] + # urls = ["http://127.0.0.1:8086"] + + ## The target database for metrics; will be created as needed. + ## For UDP url endpoint database needs to be configured on server side. + # database = "telegraf" + + ## The value of this tag will be used to determine the database. If this + ## tag is not set the 'database' option is used as the default. + # database_tag = "" + + ## If true, the 'database_tag' will not be included in the written metric. + # exclude_database_tag = false + + ## If true, no CREATE DATABASE queries will be sent.
Set to true when using + ## Telegraf with a user without permissions to create databases or when the + ## database already exists. + # skip_database_creation = false + + ## Name of existing retention policy to write to. Empty string writes to + ## the default retention policy. Only takes effect when using HTTP. + # retention_policy = "" + + ## The value of this tag will be used to determine the retention policy. If this + ## tag is not set the 'retention_policy' option is used as the default. + # retention_policy_tag = "" + + ## If true, the 'retention_policy_tag' will not be included in the written metric. + # exclude_retention_policy_tag = false + + ## Write consistency (clusters only), can be: "any", "one", "quorum", "all". + ## Only takes effect when using HTTP. + # write_consistency = "any" + + ## Timeout for HTTP messages. + # timeout = "5s" + + ## HTTP Basic Auth + # username = "telegraf" + # password = "metricsmetricsmetricsmetrics" + + ## HTTP User-Agent + # user_agent = "telegraf" + + ## UDP payload size is the maximum packet size to send. + # udp_payload = "512B" + + ## Optional TLS Config for use on HTTP connections. + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## HTTP Proxy override, if unset values the standard proxy environment + ## variables are consulted to determine which proxy, if any, should be used. + # http_proxy = "http://corporate.proxy:3128" + + ## Additional HTTP headers + # http_headers = {"X-Special-Header" = "Special-Value"} + + ## HTTP Content-Encoding for write request body, can be set to "gzip" to + ## compress body or "identity" to apply no encoding. + # content_encoding = "identity" + + ## When true, Telegraf will output unsigned integers as unsigned values, + ## i.e.: "42u". You will need a version of InfluxDB supporting unsigned + ## integer values. 
Enabling this option will result in field type errors if + ## existing data has been written. + # influx_uint_support = false + + +# # Configuration for Amon Server to send metrics to. +# [[outputs.amon]] +# ## Amon Server Key +# server_key = "my-server-key" # required. +# +# ## Amon Instance URL +# amon_instance = "https://youramoninstance" # required +# +# ## Connection timeout. +# # timeout = "5s" + + +# # Publishes metrics to an AMQP broker +# [[outputs.amqp]] +# ## Broker to publish to. +# ## deprecated in 1.7; use the brokers option +# # url = "amqp://localhost:5672/influxdb" +# +# ## Brokers to publish to. If multiple brokers are specified a random broker +# ## will be selected anytime a connection is established. This can be +# ## helpful for load balancing when not using a dedicated load balancer. +# brokers = ["amqp://localhost:5672/influxdb"] +# +# ## Maximum messages to send over a connection. Once this is reached, the +# ## connection is closed and a new connection is made. This can be helpful for +# ## load balancing when not using a dedicated load balancer. +# # max_messages = 0 +# +# ## Exchange to declare and publish to. +# exchange = "telegraf" +# +# ## Exchange type; common types are "direct", "fanout", "topic", "header", "x-consistent-hash". +# # exchange_type = "topic" +# +# ## If true, exchange will be passively declared. +# # exchange_passive = false +# +# ## Exchange durability can be either "transient" or "durable". +# # exchange_durability = "durable" +# +# ## Additional exchange arguments. +# # exchange_arguments = { } +# # exchange_arguments = {"hash_property" = "timestamp"} +# +# ## Authentication credentials for the PLAIN auth_method. +# # username = "" +# # password = "" +# +# ## Auth method. PLAIN and EXTERNAL are supported +# ## Using EXTERNAL requires enabling the rabbitmq_auth_mechanism_ssl plugin as +# ## described here: https://www.rabbitmq.com/plugins.html +# # auth_method = "PLAIN" +# +# ## Metric tag to use as a routing key. 
+# ## ie, if this tag exists, its value will be used as the routing key +# # routing_tag = "host" +# +# ## Static routing key. Used when no routing_tag is set or as a fallback +# ## when the tag specified in routing tag is not found. +# # routing_key = "" +# # routing_key = "telegraf" +# +# ## Delivery Mode controls if a published message is persistent. +# ## One of "transient" or "persistent". +# # delivery_mode = "transient" +# +# ## InfluxDB database added as a message header. +# ## deprecated in 1.7; use the headers option +# # database = "telegraf" +# +# ## InfluxDB retention policy added as a message header +# ## deprecated in 1.7; use the headers option +# # retention_policy = "default" +# +# ## Static headers added to each published message. +# # headers = { } +# # headers = {"database" = "telegraf", "retention_policy" = "default"} +# +# ## Connection timeout. If not provided, will default to 5s. 0s means no +# ## timeout (not recommended). +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## If true use batch serialization format instead of line based delimiting. +# ## Only applies to data formats which are not line based such as JSON. +# ## Recommended to set to true. +# # use_batch_format = false +# +# ## Content encoding for message payloads, can be set to "gzip" to or +# ## "identity" to apply no encoding. +# ## +# ## Please note that when use_batch_format = false each amqp message contains only +# ## a single metric, it is recommended to use compression with batch format +# ## for best results. +# # content_encoding = "identity" +# +# ## Data format to output. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# # data_format = "influx" + + +# # Send metrics to Azure Application Insights +# [[outputs.application_insights]] +# ## Instrumentation key of the Application Insights resource. +# instrumentation_key = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxx" +# +# ## Regions that require endpoint modification https://docs.microsoft.com/en-us/azure/azure-monitor/app/custom-endpoints +# # endpoint_url = "https://dc.services.visualstudio.com/v2/track" +# +# ## Timeout for closing (default: 5s). +# # timeout = "5s" +# +# ## Enable additional diagnostic logging. +# # enable_diagnostic_logging = false +# +# ## Context Tag Sources add Application Insights context tags to a tag value. +# ## +# ## For list of allowed context tag keys see: +# ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go +# # [outputs.application_insights.context_tag_sources] +# # "ai.cloud.role" = "kubernetes_container_name" +# # "ai.cloud.roleInstance" = "kubernetes_pod_name" + + +# # Send aggregate metrics to Azure Monitor +# [[outputs.azure_monitor]] +# ## Timeout for HTTP writes. +# # timeout = "20s" +# +# ## Set the namespace prefix, defaults to "Telegraf/". +# # namespace_prefix = "Telegraf/" +# +# ## Azure Monitor doesn't have a string value type, so convert string +# ## fields to dimensions (a.k.a. tags) if enabled. Azure Monitor allows +# ## a maximum of 10 dimensions so Telegraf will only send the first 10 +# ## alphanumeric dimensions. +# # strings_as_dimensions = false +# +# ## Both region and resource_id must be set or be available via the +# ## Instance Metadata service on Azure Virtual Machines. +# # +# ## Azure Region to publish metrics against. 
+# ## ex: region = "southcentralus" +# # region = "" +# # +# ## The Azure Resource ID against which metric will be logged, e.g. +# ## ex: resource_id = "/subscriptions//resourceGroups//providers/Microsoft.Compute/virtualMachines/" +# # resource_id = "" +# +# ## Optionally, if in Azure US Government, China or other sovereign +# ## cloud environment, set appropriate REST endpoint for receiving +# ## metrics. (Note: region may be unused in this context) +# # endpoint_url = "https://monitoring.core.usgovcloudapi.net" + + +# # Publish Telegraf metrics to a Google Cloud PubSub topic +# [[outputs.cloud_pubsub]] +# ## Required. Name of Google Cloud Platform (GCP) Project that owns +# ## the given PubSub topic. +# project = "my-project" +# +# ## Required. Name of PubSub topic to publish metrics to. +# topic = "my-topic" +# +# ## Required. Data format to consume. +# ## Each data format has its own unique set of configuration options. +# ## Read more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# +# ## Optional. Filepath for GCP credentials JSON file to authorize calls to +# ## PubSub APIs. If not set explicitly, Telegraf will attempt to use +# ## Application Default Credentials, which is preferred. +# # credentials_file = "path/to/my/creds.json" +# +# ## Optional. If true, will send all metrics per write in one PubSub message. +# # send_batched = true +# +# ## The following publish_* parameters specifically configures batching +# ## requests made to the GCP Cloud PubSub API via the PubSub Golang library. Read +# ## more here: https://godoc.org/cloud.google.com/go/pubsub#PublishSettings +# +# ## Optional. Send a request to PubSub (i.e. actually publish a batch) +# ## when it has this many PubSub messages. If send_batched is true, +# ## this is ignored and treated as if it were 1. +# # publish_count_threshold = 1000 +# +# ## Optional. Send a request to PubSub (i.e. 
actually publish a batch) +# ## when it has this many PubSub messages. If send_batched is true, +# ## this is ignored and treated as if it were 1 +# # publish_byte_threshold = 1000000 +# +# ## Optional. Specifically configures requests made to the PubSub API. +# # publish_num_go_routines = 2 +# +# ## Optional. Specifies a timeout for requests to the PubSub API. +# # publish_timeout = "30s" +# +# ## Optional. If true, published PubSub message data will be base64-encoded. +# # base64_data = false +# +# ## Optional. PubSub attributes to add to metrics. +# # [[inputs.pubsub.attributes]] +# # my_attr = "tag_value" + + +# # Configuration for AWS CloudWatch output. +# [[outputs.cloudwatch]] +# ## Amazon REGION +# region = "us-east-1" +# +# ## Amazon Credentials +# ## Credentials are loaded in the following order +# ## 1) Assumed credentials via STS if role_arn is specified +# ## 2) explicit credentials from 'access_key' and 'secret_key' +# ## 3) shared profile from 'profile' +# ## 4) environment variables +# ## 5) shared credentials file +# ## 6) EC2 Instance Profile +# #access_key = "" +# #secret_key = "" +# #token = "" +# #role_arn = "" +# #profile = "" +# #shared_credential_file = "" +# +# ## Endpoint to make request against, the correct endpoint is automatically +# ## determined and this option should only be set if you wish to override the +# ## default. +# ## ex: endpoint_url = "http://localhost:8000" +# # endpoint_url = "" +# +# ## Namespace for the CloudWatch MetricDatums +# namespace = "InfluxData/Telegraf" +# +# ## If you have a large amount of metrics, you should consider to send statistic +# ## values instead of raw metrics which could not only improve performance but +# ## also save AWS API cost. If enable this flag, this plugin would parse the required +# ## CloudWatch statistic fields (count, min, max, and sum) and send them to CloudWatch. +# ## You could use basicstats aggregator to calculate those fields. 
If not all statistic +# ## fields are available, all fields would still be sent as raw metrics. +# # write_statistics = false +# +# ## Enable high resolution metrics of 1 second (if not enabled, standard resolution are of 60 seconds precision) +# # high_resolution_metrics = false + + +# # Configuration for CrateDB to send metrics to. +# [[outputs.cratedb]] +# # A github.com/jackc/pgx connection string. +# # See https://godoc.org/github.com/jackc/pgx#ParseDSN +# url = "postgres://user:password@localhost/schema?sslmode=disable" +# # Timeout for all CrateDB queries. +# timeout = "5s" +# # Name of the table to store metrics in. +# table = "metrics" +# # If true, and the metrics table does not exist, create it automatically. +# table_create = true + + +# # Configuration for DataDog API to send metrics to. +# [[outputs.datadog]] +# ## Datadog API key +# apikey = "my-secret-key" +# +# ## Connection timeout. +# # timeout = "5s" +# +# ## Write URL override; useful for debugging. +# # url = "https://app.datadoghq.com/api/v1/series" + + +# # Send metrics to nowhere at all +# [[outputs.discard]] +# # no configuration + + +# # Configuration for Elasticsearch to send metrics to. +# [[outputs.elasticsearch]] +# ## The full HTTP endpoint URL for your Elasticsearch instance +# ## Multiple urls can be specified as part of the same cluster, +# ## this means that only ONE of the urls will be written to each interval. +# urls = [ "http://node1.es.example.com:9200" ] # required. +# ## Elasticsearch client timeout, defaults to "5s" if not set. +# timeout = "5s" +# ## Set to true to ask Elasticsearch a list of all cluster nodes, +# ## thus it is not necessary to list all nodes in the urls config option. 
+# enable_sniffer = false +# ## Set the interval to check if the Elasticsearch nodes are available +# ## Setting to "0s" will disable the health check (not recommended in production) +# health_check_interval = "10s" +# ## HTTP basic authentication details +# # username = "telegraf" +# # password = "mypassword" +# +# ## Index Config +# ## The target index for metrics (Elasticsearch will create if it not exists). +# ## You can use the date specifiers below to create indexes per time frame. +# ## The metric timestamp will be used to decide the destination index name +# # %Y - year (2016) +# # %y - last two digits of year (00..99) +# # %m - month (01..12) +# # %d - day of month (e.g., 01) +# # %H - hour (00..23) +# # %V - week of the year (ISO week) (01..53) +# ## Additionally, you can specify a tag name using the notation {{tag_name}} +# ## which will be used as part of the index name. If the tag does not exist, +# ## the default tag value will be used. +# # index_name = "telegraf-{{host}}-%Y.%m.%d" +# # default_tag_value = "none" +# index_name = "telegraf-%Y.%m.%d" # required. +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Template Config +# ## Set to true if you want telegraf to manage its index template. +# ## If enabled it will create a recommended index template for telegraf indexes +# manage_template = true +# ## The template name used for telegraf indexes +# template_name = "telegraf" +# ## Set to true if you want telegraf to overwrite an existing template +# overwrite_template = false + + +# # Send metrics to command as input over stdin +# [[outputs.exec]] +# ## Command to ingest metrics via stdin. +# command = ["tee", "-a", "/dev/null"] +# +# ## Timeout for command to complete. +# # timeout = "5s" +# +# ## Data format to output. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# # data_format = "influx" + + +# # Run executable as long-running output plugin +# [[outputs.execd]] +# ## Program to run as daemon +# command = ["my-telegraf-output", "--some-flag", "value"] +# +# ## Delay before the process is restarted after an unexpected termination +# restart_delay = "10s" +# +# ## Data format to export. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" + + +# # Send telegraf metrics to file(s) +# [[outputs.file]] +# ## Files to write to, "stdout" is a specially handled file. +# files = ["stdout", "/tmp/metrics.out"] +# +# ## Use batch serialization format instead of line based delimiting. The +# ## batch format allows for the production of non line based output formats and +# ## may more efficiently encode metric groups. +# # use_batch_format = false +# +# ## The file will be rotated after the time interval specified. When set +# ## to 0 no time based rotation is performed. +# # rotation_interval = "0d" +# +# ## The logfile will be rotated when it becomes larger than the specified +# ## size. When set to 0 no size based rotation is performed. +# # rotation_max_size = "0MB" +# +# ## Maximum number of rotated archives to keep, any older logs are deleted. +# ## If set to -1, no archives are removed. +# # rotation_max_archives = 5 +# +# ## Data format to output. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" + + +# # Configuration for Graphite server to send metrics to +# [[outputs.graphite]] +# ## TCP endpoint for your graphite instance. +# ## If multiple endpoints are configured, output will be load balanced. +# ## Only one of the endpoints will be written to with each iteration. +# servers = ["localhost:2003"] +# ## Prefix metrics name +# prefix = "" +# ## Graphite output template +# ## see https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# template = "host.tags.measurement.field" +# +# ## Enable Graphite tags support +# # graphite_tag_support = false +# +# ## Character for separating metric name and field for Graphite tags +# # graphite_separator = "." +# +# ## Graphite templates patterns +# ## 1. Template for cpu +# ## 2. Template for disk* +# ## 3. Default template +# # templates = [ +# # "cpu tags.measurement.host.field", +# # "disk* measurement.field", +# # "host.measurement.tags.field" +# #] +# +# ## timeout in seconds for the write connection to graphite +# timeout = 2 +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Send telegraf metrics to graylog +# [[outputs.graylog]] +# ## UDP endpoint for your graylog instance. +# servers = ["127.0.0.1:12201"] +# +# ## The field to use as the GELF short_message, if unset the static string +# ## "telegraf" will be used. +# ## example: short_message_field = "message" +# # short_message_field = "" + + +# # Configurable HTTP health check resource based on metrics +# [[outputs.health]] +# ## Address and port to listen on. 
+# ## ex: service_address = "http://localhost:8080" +# ## service_address = "unix:///var/run/telegraf-health.sock" +# # service_address = "http://:8080" +# +# ## The maximum duration for reading the entire request. +# # read_timeout = "5s" +# ## The maximum duration for writing the entire response. +# # write_timeout = "5s" +# +# ## Username and password to accept for HTTP basic authentication. +# # basic_username = "user1" +# # basic_password = "secret" +# +# ## Allowed CA certificates for client certificates. +# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## TLS server certificate and private key. +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## One or more check sub-tables should be defined, it is also recommended to +# ## use metric filtering to limit the metrics that flow into this output. +# ## +# ## When using the default buffer sizes, this example will fail when the +# ## metric buffer is half full. +# ## +# ## namepass = ["internal_write"] +# ## tagpass = { output = ["influxdb"] } +# ## +# ## [[outputs.health.compares]] +# ## field = "buffer_size" +# ## lt = 5000.0 +# ## +# ## [[outputs.health.contains]] +# ## field = "buffer_size" + + +# # A plugin that can transmit metrics over HTTP +# [[outputs.http]] +# ## URL is the address to send metrics to +# url = "http://127.0.0.1:8080/telegraf" +# +# ## Timeout for HTTP message +# # timeout = "5s" +# +# ## HTTP method, one of: "POST" or "PUT" +# # method = "POST" +# +# ## HTTP Basic Auth credentials +# # username = "username" +# # password = "pa$$word" +# +# ## OAuth2 Client Credentials Grant +# # client_id = "clientid" +# # client_secret = "secret" +# # token_url = "https://indentityprovider/oauth2/v1/token" +# # scopes = ["urn:opc:idm:__myscopes__"] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # 
insecure_skip_verify = false +# +# ## Data format to output. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# # data_format = "influx" +# +# ## HTTP Content-Encoding for write request body, can be set to "gzip" to +# ## compress body or "identity" to apply no encoding. +# # content_encoding = "identity" +# +# ## Additional HTTP headers +# # [outputs.http.headers] +# # # Should be set manually to "application/json" for json data_format +# # Content-Type = "text/plain; charset=utf-8" + + +# # Configuration for sending metrics to InfluxDB +# [[outputs.influxdb_v2]] +# ## The URLs of the InfluxDB cluster nodes. +# ## +# ## Multiple URLs can be specified for a single cluster, only ONE of the +# ## urls will be written to each interval. +# ## ex: urls = ["https://us-west-2-1.aws.cloud2.influxdata.com"] +# urls = ["http://127.0.0.1:9999"] +# +# ## Token for authentication. +# token = "" +# +# ## Organization is the name of the organization you wish to write to; must exist. +# organization = "" +# +# ## Destination bucket to write into. +# bucket = "" +# +# ## The value of this tag will be used to determine the bucket. If this +# ## tag is not set the 'bucket' option is used as the default. +# # bucket_tag = "" +# +# ## If true, the bucket tag will not be added to the metric. +# # exclude_bucket_tag = false +# +# ## Timeout for HTTP messages. +# # timeout = "5s" +# +# ## Additional HTTP headers +# # http_headers = {"X-Special-Header" = "Special-Value"} +# +# ## HTTP Proxy override; if unset, the standard proxy environment +# ## variables are consulted to determine which proxy, if any, should be used.
+# # http_proxy = "http://corporate.proxy:3128" +# +# ## HTTP User-Agent +# # user_agent = "telegraf" +# +# ## Content-Encoding for write request body, can be set to "gzip" to +# ## compress body or "identity" to apply no encoding. +# # content_encoding = "gzip" +# +# ## Enable or disable uint support for writing uints influxdb 2.0. +# # influx_uint_support = false +# +# ## Optional TLS Config for use on HTTP connections. +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Configuration for sending metrics to an Instrumental project +# [[outputs.instrumental]] +# ## Project API Token (required) +# api_token = "API Token" # required +# ## Prefix the metrics with a given name +# prefix = "" +# ## Stats output template (Graphite formatting) +# ## see https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md#graphite +# template = "host.tags.measurement.field" +# ## Timeout in seconds to connect +# timeout = "2s" +# ## Display Communication to Instrumental +# debug = false + + +# # Configuration for the Kafka server to send metrics to +# [[outputs.kafka]] +# ## URLs of kafka brokers +# brokers = ["localhost:9092"] +# ## Kafka topic for producer messages +# topic = "telegraf" +# +# ## The value of this tag will be used as the topic. If not set the 'topic' +# ## option is used. +# # topic_tag = "" +# +# ## If true, the 'topic_tag' will be removed from to the metric. +# # exclude_topic_tag = false +# +# ## Optional Client id +# # client_id = "Telegraf" +# +# ## Set the minimal supported Kafka version. Setting this enables the use of new +# ## Kafka features and APIs. Of particular interest, lz4 compression +# ## requires at least version 0.10.0.0. +# ## ex: version = "1.1.0" +# # version = "" +# +# ## Optional topic suffix configuration. +# ## If the section is omitted, no suffix is used. 
+# ## Following topic suffix methods are supported: +# ## measurement - suffix equals to separator + measurement's name +# ## tags - suffix equals to separator + specified tags' values +# ## interleaved with separator +# +# ## Suffix equals to "_" + measurement name +# # [outputs.kafka.topic_suffix] +# # method = "measurement" +# # separator = "_" +# +# ## Suffix equals to "__" + measurement's "foo" tag value. +# ## If there is no such tag, suffix equals to an empty string +# # [outputs.kafka.topic_suffix] +# # method = "tags" +# # keys = ["foo"] +# # separator = "__" +# +# ## Suffix equals to "_" + measurement's "foo" and "bar" +# ## tag values, separated by "_". If there are no such tags, +# ## their values are treated as empty strings. +# # [outputs.kafka.topic_suffix] +# # method = "tags" +# # keys = ["foo", "bar"] +# # separator = "_" +# +# ## The routing tag specifies a tagkey on the metric whose value is used as +# ## the message key. The message key is used to determine which partition to +# ## send the message to. This tag is preferred over the routing_key option. +# routing_tag = "host" +# +# ## The routing key is set as the message key and used to determine which +# ## partition to send the message to. This value is only used when no +# ## routing_tag is set or as a fallback when the tag specified in routing tag +# ## is not found. +# ## +# ## If set to "random", a random value will be generated for each message. +# ## +# ## When unset, no message key is added and each message is routed to a random +# ## partition. +# ## +# ## ex: routing_key = "random" +# ## routing_key = "telegraf" +# # routing_key = "" +# +# ## CompressionCodec represents the various compression codecs recognized by +# ## Kafka in messages.
+# ## 0 : No compression +# ## 1 : Gzip compression +# ## 2 : Snappy compression +# ## 3 : LZ4 compression +# # compression_codec = 0 +# +# ## RequiredAcks is used in Produce Requests to tell the broker how many +# ## replica acknowledgements it must see before responding +# ## 0 : the producer never waits for an acknowledgement from the broker. +# ## This option provides the lowest latency but the weakest durability +# ## guarantees (some data will be lost when a server fails). +# ## 1 : the producer gets an acknowledgement after the leader replica has +# ## received the data. This option provides better durability as the +# ## client waits until the server acknowledges the request as successful +# ## (only messages that were written to the now-dead leader but not yet +# ## replicated will be lost). +# ## -1: the producer gets an acknowledgement after all in-sync replicas have +# ## received the data. This option provides the best durability, we +# ## guarantee that no messages will be lost as long as at least one in +# ## sync replica remains. +# # required_acks = -1 +# +# ## The maximum number of times to retry sending a metric before failing +# ## until the next flush. +# # max_retry = 3 +# +# ## The maximum permitted size of a message. Should be set equal to or +# ## smaller than the broker's 'message.max.bytes'. +# # max_message_bytes = 1000000 +# +# ## Optional TLS Config +# # enable_tls = true +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Optional SASL Config +# # sasl_username = "kafka" +# # sasl_password = "secret" +# +# ## SASL protocol version. When connecting to Azure EventHub set to 0. +# # sasl_version = 1 +# +# ## Data format to output. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# # data_format = "influx" + + +# # Configuration for the AWS Kinesis output. +# [[outputs.kinesis]] +# ## Amazon REGION of kinesis endpoint. +# region = "ap-southeast-2" +# +# ## Amazon Credentials +# ## Credentials are loaded in the following order +# ## 1) Assumed credentials via STS if role_arn is specified +# ## 2) explicit credentials from 'access_key' and 'secret_key' +# ## 3) shared profile from 'profile' +# ## 4) environment variables +# ## 5) shared credentials file +# ## 6) EC2 Instance Profile +# #access_key = "" +# #secret_key = "" +# #token = "" +# #role_arn = "" +# #profile = "" +# #shared_credential_file = "" +# +# ## Endpoint to make request against, the correct endpoint is automatically +# ## determined and this option should only be set if you wish to override the +# ## default. +# ## ex: endpoint_url = "http://localhost:8000" +# # endpoint_url = "" +# +# ## Kinesis StreamName must exist prior to starting telegraf. +# streamname = "StreamName" +# ## DEPRECATED: PartitionKey as used for sharding data. +# partitionkey = "PartitionKey" +# ## DEPRECATED: If set the partitionKey will be a random UUID on every put. +# ## This allows for scaling across multiple shards in a stream. +# ## This will cause issues with ordering. 
+# use_random_partitionkey = false +# ## The partition key can be calculated using one of several methods: +# ## +# ## Use a static value for all writes: +# # [outputs.kinesis.partition] +# # method = "static" +# # key = "howdy" +# # +# ## Use a random partition key on each write: +# # [outputs.kinesis.partition] +# # method = "random" +# # +# ## Use the measurement name as the partition key: +# # [outputs.kinesis.partition] +# # method = "measurement" +# # +# ## Use the value of a tag for all writes, if the tag is not set the empty +# ## default option will be used. When no default, defaults to "telegraf" +# # [outputs.kinesis.partition] +# # method = "tag" +# # key = "host" +# # default = "mykey" +# +# +# ## Data format to output. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" +# +# ## debug will show upstream aws messages. +# debug = false + + +# # Configuration for Librato API to send metrics to. +# [[outputs.librato]] +# ## Librato API Docs +# ## http://dev.librato.com/v1/metrics-authentication +# ## Librato API user +# api_user = "telegraf@influxdb.com" # required. +# ## Librato API token +# api_token = "my-secret-token" # required. +# ## Debug +# # debug = false +# ## Connection timeout. +# # timeout = "5s" +# ## Output source Template (same as graphite buckets) +# ## see https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md#graphite +# ## This template is used in librato's source (not metric's name) +# template = "host" +# + + +# # Configuration for MQTT server to send metrics to +# [[outputs.mqtt]] +# servers = ["localhost:1883"] # required. 
+# +# ## MQTT outputs send metrics to this topic format +# ## "<topic_prefix>/<hostname>/<pluginname>/" +# ## ex: prefix/web01.example.com/mem +# topic_prefix = "telegraf" +# +# ## QoS policy for messages +# ## 0 = at most once +# ## 1 = at least once +# ## 2 = exactly once +# # qos = 2 +# +# ## username and password to connect MQTT server. +# # username = "telegraf" +# # password = "metricsmetricsmetricsmetrics" +# +# ## client ID, if not set a random ID is generated +# # client_id = "" +# +# ## Timeout for write operations. default: 5s +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## When true, metrics will be sent in one MQTT message per flush. Otherwise, +# ## metrics are written one metric per MQTT message. +# # batch = false +# +# ## When true, metric will have RETAIN flag set, making broker cache entries until someone +# ## actually reads it +# # retain = false +# +# ## Data format to output.
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" + + +# # Send telegraf measurements to NATS +# [[outputs.nats]] +# ## URLs of NATS servers +# servers = ["nats://localhost:4222"] +# +# ## Optional credentials +# # username = "" +# # password = "" +# +# ## Optional NATS 2.0 and NATS NGS compatible user credentials +# # credentials = "/etc/telegraf/nats.creds" +# +# ## NATS subject for producer messages +# subject = "telegraf" +# +# ## Use Transport Layer Security +# # secure = false +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Data format to output. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" + + +# # Send metrics to New Relic metrics endpoint +# [[outputs.newrelic]] +# ## New Relic Insights API key +# insights_key = "insights api key" +# +# ## Prefix to add to metric name for easy identification. +# # metric_prefix = "" +# +# ## Timeout for writes to the New Relic API. +# # timeout = "15s" + + +# # Send telegraf measurements to NSQD +# [[outputs.nsq]] +# ## Location of nsqd instance listening on TCP +# server = "localhost:4150" +# ## NSQ topic for producer messages +# topic = "telegraf" +# +# ## Data format to output.
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" + + +# # Configuration for OpenTSDB server to send metrics to +# [[outputs.opentsdb]] +# ## prefix for metrics keys +# prefix = "my.specific.prefix." +# +# ## DNS name of the OpenTSDB server +# ## Using "opentsdb.example.com" or "tcp://opentsdb.example.com" will use the +# ## telnet API. "http://opentsdb.example.com" will use the Http API. +# host = "opentsdb.example.com" +# +# ## Port of the OpenTSDB server +# port = 4242 +# +# ## Number of data points to send to OpenTSDB in Http requests. +# ## Not used with telnet API. +# http_batch_size = 50 +# +# ## URI Path for Http requests to OpenTSDB. +# ## Used in cases where OpenTSDB is located behind a reverse proxy. +# http_path = "/api/put" +# +# ## Debug true - Prints OpenTSDB communication +# debug = false +# +# ## Separator separates measurement name from field +# separator = "_" + + +# # Configuration for the Prometheus client to spawn +# [[outputs.prometheus_client]] +# ## Address to listen on +# listen = ":9273" +# +# ## Metric version controls the mapping from Telegraf metrics into +# ## Prometheus format. When using the prometheus input, use the same value in +# ## both plugins to ensure metrics are round-tripped without modification. +# ## +# ## example: metric_version = 1; deprecated in 1.13 +# ## metric_version = 2; recommended version +# # metric_version = 1 +# +# ## Use HTTP Basic Authentication. +# # basic_username = "Foo" +# # basic_password = "Bar" +# +# ## If set, the IP Ranges which are allowed to access metrics. +# ## ex: ip_range = ["192.168.0.0/24", "192.168.1.0/30"] +# # ip_range = [] +# +# ## Path to publish the metrics on. +# # path = "/metrics" +# +# ## Expiration interval for each metric. 
0 == no expiration +# # expiration_interval = "60s" +# +# ## Collectors to enable, valid entries are "gocollector" and "process". +# ## If unset, both are enabled. +# # collectors_exclude = ["gocollector", "process"] +# +# ## Send string metrics as Prometheus labels. +# ## Unless set to false all string metrics will be sent as labels. +# # string_as_label = true +# +# ## If set, enable TLS with the given certificate. +# # tls_cert = "/etc/ssl/telegraf.crt" +# # tls_key = "/etc/ssl/telegraf.key" +# +# ## Set one or more allowed client CA certificate file names to +# ## enable mutually authenticated TLS connections +# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Export metric collection time. +# # export_timestamp = false + + +# # Configuration for the Riemann server to send metrics to +# [[outputs.riemann]] +# ## The full TCP or UDP URL of the Riemann server +# url = "tcp://localhost:5555" +# +# ## Riemann event TTL, floating-point time in seconds. +# ## Defines how long that an event is considered valid for in Riemann +# # ttl = 30.0 +# +# ## Separator to use between measurement and field name in Riemann service name +# ## This does not have any effect if 'measurement_as_attribute' is set to 'true' +# separator = "/" +# +# ## Set measurement name as Riemann attribute 'measurement', instead of prepending it to the Riemann service name +# # measurement_as_attribute = false +# +# ## Send string metrics as Riemann event states. +# ## Unless enabled all string metrics will be ignored +# # string_as_state = false +# +# ## A list of tag keys whose values get sent as Riemann tags. +# ## If empty, all Telegraf tag values will be sent as tags +# # tag_keys = ["telegraf","custom_tag"] +# +# ## Additional Riemann tags to send. +# # tags = ["telegraf-output"] +# +# ## Description for Riemann event +# # description_text = "metrics collected from telegraf" +# +# ## Riemann client write timeout, defaults to "5s" if not set. 
+# # timeout = "5s" + + +# # Configuration for the Riemann server to send metrics to +# [[outputs.riemann_legacy]] +# ## URL of server +# url = "localhost:5555" +# ## transport protocol to use either tcp or udp +# transport = "tcp" +# ## separator to use between input name and field name in Riemann service name +# separator = " " + + +# # Generic socket writer capable of handling multiple socket types. +# [[outputs.socket_writer]] +# ## URL to connect to +# # address = "tcp://127.0.0.1:8094" +# # address = "tcp://example.com:http" +# # address = "tcp4://127.0.0.1:8094" +# # address = "tcp6://127.0.0.1:8094" +# # address = "tcp6://[2001:db8::1]:8094" +# # address = "udp://127.0.0.1:8094" +# # address = "udp4://127.0.0.1:8094" +# # address = "udp6://127.0.0.1:8094" +# # address = "unix:///tmp/telegraf.sock" +# # address = "unixgram:///tmp/telegraf.sock" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Period between keep alive probes. +# ## Only applies to TCP sockets. +# ## 0 disables keep alive probes. +# ## Defaults to the OS configuration. +# # keep_alive_period = "5m" +# +# ## Content encoding for packet-based connections (i.e. UDP, unixgram). +# ## Can be set to "gzip" or to "identity" to apply no encoding. +# ## +# # content_encoding = "identity" +# +# ## Data format to generate. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# # data_format = "influx" + + +# # Configuration for Google Cloud Stackdriver to send metrics to +# [[outputs.stackdriver]] +# ## GCP Project +# project = "erudite-bloom-151019" +# +# ## The namespace for the metric descriptor +# namespace = "telegraf" +# +# ## Custom resource type +# # resource_type = "generic_node" +# +# ## Additional resource labels +# # [outputs.stackdriver.resource_labels] +# # node_id = "$HOSTNAME" +# # namespace = "myapp" +# # location = "eu-north0" + + +# # Configuration for Syslog server to send metrics to +# [[outputs.syslog]] +# ## URL to connect to +# ## ex: address = "tcp://127.0.0.1:8094" +# ## ex: address = "tcp4://127.0.0.1:8094" +# ## ex: address = "tcp6://127.0.0.1:8094" +# ## ex: address = "tcp6://[2001:db8::1]:8094" +# ## ex: address = "udp://127.0.0.1:8094" +# ## ex: address = "udp4://127.0.0.1:8094" +# ## ex: address = "udp6://127.0.0.1:8094" +# address = "tcp://127.0.0.1:8094" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Period between keep alive probes. +# ## Only applies to TCP sockets. +# ## 0 disables keep alive probes. +# ## Defaults to the OS configuration. +# # keep_alive_period = "5m" +# +# ## The framing technique with which it is expected that messages are +# ## transported (default = "octet-counting"). Whether the messages come +# ## using the octet-counting (RFC5425#section-4.3.1, RFC6587#section-3.4.1), +# ## or the non-transparent framing technique (RFC6587#section-3.4.2). Must +# ## be one of "octet-counting", "non-transparent".
+# # framing = "octet-counting" +# +# ## The trailer to be expected in case of non-transparent framing (default = "LF"). +# ## Must be one of "LF", or "NUL". +# # trailer = "LF" +# +# ## SD-PARAMs settings +# ## Syslog messages can contain key/value pairs within zero or more +# ## structured data sections. For each unrecognized metric tag/field a +# ## SD-PARAMS is created. +# ## +# ## Example: +# ## [[outputs.syslog]] +# ## sdparam_separator = "_" +# ## default_sdid = "default@32473" +# ## sdids = ["foo@123", "bar@456"] +# ## +# ## input => xyzzy,x=y foo@123_value=42,bar@456_value2=84,something_else=1 +# ## output (structured data only) => [foo@123 value=42][bar@456 value2=84][default@32473 something_else=1 x=y] +# +# ## SD-PARAMs separator between the sdid and tag/field key (default = "_") +# # sdparam_separator = "_" +# +# ## Default sdid used for tags/fields that don't contain a prefix defined in +# ## the explicit sdids setting below If no default is specified, no SD-PARAMs +# ## will be used for unrecognized field. +# # default_sdid = "default@32473" +# +# ## List of explicit prefixes to extract from tag/field keys and use as the +# ## SDID, if they match (see above example for more details): +# # sdids = ["foo@123", "bar@456"] +# +# ## Default severity value. Severity and Facility are used to calculate the +# ## message PRI value (RFC5424#section-6.2.1). Used when no metric field +# ## with key "severity_code" is defined. If unset, 5 (notice) is the default +# # default_severity_code = 5 +# +# ## Default facility value. Facility and Severity are used to calculate the +# ## message PRI value (RFC5424#section-6.2.1). Used when no metric field with +# ## key "facility_code" is defined. If unset, 1 (user-level) is the default +# # default_facility_code = 1 +# +# ## Default APP-NAME value (RFC5424#section-6.2.5) +# ## Used when no metric tag with key "appname" is defined. 
+# ## If unset, "Telegraf" is the default +# # default_appname = "Telegraf" + + +# # Write metrics to Warp 10 +# [[outputs.warp10]] +# # Prefix to add to the measurement. +# prefix = "telegraf." +# +# # URL of the Warp 10 server +# warp_url = "http://localhost:8080" +# +# # Write token to access your app on warp 10 +# token = "Token" +# +# # Warp 10 query timeout +# # timeout = "15s" +# +# ## Print Warp 10 error body +# # print_error_body = false +# +# ## Max string error size +# # max_string_error_size = 511 +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Configuration for Wavefront server to send metrics to +# [[outputs.wavefront]] +# ## Url for Wavefront Direct Ingestion or using HTTP with Wavefront Proxy +# ## If using Wavefront Proxy, also specify port. example: http://proxyserver:2878 +# url = "https://metrics.wavefront.com" +# +# ## Authentication Token for Wavefront. Only required if using Direct Ingestion +# #token = "DUMMY_TOKEN" +# +# ## DNS name of the wavefront proxy server. Do not use if url is specified +# #host = "wavefront.example.com" +# +# ## Port that the Wavefront proxy server listens on. Do not use if url is specified +# #port = 2878 +# +# ## prefix for metrics keys +# #prefix = "my.specific.prefix." +# +# ## whether to use "value" for name of simple fields. default is false +# #simple_fields = false +# +# ## character to use between metric and field name. default is . (dot) +# #metric_separator = "." +# +# ## Convert metric name paths to use metricSeparator character +# ## When true will convert all _ (underscore) characters in final metric name. 
default is true +# #convert_paths = true +# +# ## Use Strict rules to sanitize metric and tag names from invalid characters +# ## When enabled forward slash (/) and comma (,) will be accepted +# #use_strict = false +# +# ## Use Regex to sanitize metric and tag names from invalid characters +# ## Regex is more thorough, but significantly slower. default is false +# #use_regex = false +# +# ## point tags to use as the source name for Wavefront (if none found, host will be used) +# #source_override = ["hostname", "address", "agent_host", "node_host"] +# +# ## whether to convert boolean values to numeric values, with false -> 0.0 and true -> 1.0. default is true +# #convert_bool = true +# +# ## Truncate metric tags to a total of 254 characters for the tag name value. Wavefront will reject any +# ## data point exceeding this limit if not truncated. Defaults to 'false' to provide backwards compatibility. +# #truncate_tags = false +# +# ## Define a mapping, namespaced by metric prefix, from string values to numeric values +# ## deprecated in 1.9; use the enum processor plugin +# #[[outputs.wavefront.string_to_number.elasticsearch]] +# # green = 1.0 +# # yellow = 0.5 +# # red = 0.0 + + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + + +# # Clone metrics and apply modifications. +# [[processors.clone]] +# ## All modifications on inputs and aggregators can be overridden: +# # name_override = "new_name" +# # name_prefix = "new_name_prefix" +# # name_suffix = "new_name_suffix" +# +# ## Tags to be added (all values must be strings) +# # [processors.clone.tags] +# # additional_tag = "tag_value" + + +# # Convert values to another metric value type +# [[processors.converter]] +# ## Tags to convert +# ## +# ## The table key determines the target type, and the array of key-values +# ## select the keys to convert. The array may contain globs. 
+# ## <target-type> = [<tag-key>...] +# [processors.converter.tags] +# measurement = [] +# string = [] +# integer = [] +# unsigned = [] +# boolean = [] +# float = [] +# +# ## Fields to convert +# ## +# ## The table key determines the target type, and the array of key-values +# ## select the keys to convert. The array may contain globs. +# ## <target-type> = [<field-key>...] +# [processors.converter.fields] +# measurement = [] +# tag = [] +# string = [] +# integer = [] +# unsigned = [] +# boolean = [] +# float = [] + + +# # Dates measurements, tags, and fields that pass through this filter. +# [[processors.date]] +# ## New tag to create +# tag_key = "month" +# +# ## New field to create (cannot set both field_key and tag_key) +# # field_key = "month" +# +# ## Date format string, must be a representation of the Go "reference time" +# ## which is "Mon Jan 2 15:04:05 -0700 MST 2006". +# date_format = "Jan" +# +# ## If destination is a field, date format can also be one of +# ## "unix", "unix_ms", "unix_us", or "unix_ns", which will insert an integer field. +# # date_format = "unix" +# +# ## Offset duration added to the date string when writing the new tag. +# # date_offset = "0s" +# +# ## Timezone to use when creating the tag or field using a reference time +# ## string. This can be set to one of "UTC", "Local", or to a location name +# ## in the IANA Time Zone database. +# ## example: timezone = "America/Los_Angeles" +# # timezone = "UTC" + + +# # Filter metrics with repeating field values +# [[processors.dedup]] +# ## Maximum time to suppress output +# dedup_interval = "600s" + + +# # Defaults sets default value(s) for specified fields that are not set on incoming metrics. +# [[processors.defaults]] +# ## Ensures a set of fields always exists on your metric(s) with their +# ## respective default value. +# ## For any given field pair (key = default), if it's not set, a field +# ## is set on the metric with the specified default.
+# ## +# ## A field is considered not set if it is nil on the incoming metric; +# ## or it is not nil but its value is an empty string or is a string +# ## of one or more spaces. +# ## <target-field> = <value> +# # [processors.defaults.fields] +# # field_1 = "bar" +# # time_idle = 0 +# # is_error = true + + +# # Map enum values according to given table. +# [[processors.enum]] +# [[processors.enum.mapping]] +# ## Name of the field to map +# field = "status" +# +# ## Name of the tag to map +# # tag = "status" +# +# ## Destination tag or field to be used for the mapped value. By default the +# ## source tag or field is used, overwriting the original value. +# dest = "status_code" +# +# ## Default value to be used for all values not contained in the mapping +# ## table. When unset, the unmodified value for the field will be used if no +# ## match is found. +# # default = 0 +# +# ## Table of mappings +# [processors.enum.mapping.value_mappings] +# green = 1 +# amber = 2 +# red = 3 + + +# # Run executable as long-running processor plugin +# [[processors.execd]] +# ## Program to run as daemon +# ## eg: command = ["/path/to/your_program", "arg1", "arg2"] +# command = ["cat"] +# +# ## Delay before the process is restarted after an unexpected termination +# restart_delay = "10s" + + +# # Performs file path manipulations on tags and fields +# [[processors.filepath]] +# ## Treat the tag value as a path and convert it to its last element, storing the result in a new tag +# # [[processors.filepath.basename]] +# # tag = "path" +# # dest = "basepath" +# +# ## Treat the field value as a path and keep all but the last element of path, typically the path's directory +# # [[processors.filepath.dirname]] +# # field = "path" +# +# ## Treat the tag value as a path, converting it to its last element without its suffix +# # [[processors.filepath.stem]] +# # tag = "path" +# +# ## Treat the tag value as a path, converting it to the shortest path name equivalent +# ## to path by purely lexical processing +# #
[[processors.filepath.clean]] +# # tag = "path" +# +# ## Treat the tag value as a path, converting it to a relative path that is lexically +# ## equivalent to the source path when joined to 'base_path' +# # [[processors.filepath.rel]] +# # tag = "path" +# # base_path = "/var/log" +# +# ## Treat the tag value as a path, replacing each separator character in path with a '/' character. Has only +# ## effect on Windows +# # [[processors.filepath.toslash]] +# # tag = "path" + + +# # Add a tag of the network interface name looked up over SNMP by interface number +# [[processors.ifname]] +# ## Name of tag holding the interface number +# # tag = "ifIndex" +# +# ## Name of output tag where service name will be added +# # dest = "ifName" +# +# ## Name of tag of the SNMP agent to request the interface name from +# # agent = "agent" +# +# ## Timeout for each request. +# # timeout = "5s" +# +# ## SNMP version; can be 1, 2, or 3. +# # version = 2 +# +# ## SNMP community string. +# # community = "public" +# +# ## Number of retries to attempt. +# # retries = 3 +# +# ## The GETBULK max-repetitions parameter. +# # max_repetitions = 10 +# +# ## SNMPv3 authentication and encryption options. +# ## +# ## Security Name. +# # sec_name = "myuser" +# ## Authentication protocol; one of "MD5", "SHA", or "". +# # auth_protocol = "MD5" +# ## Authentication password. +# # auth_password = "pass" +# ## Security Level; one of "noAuthNoPriv", "authNoPriv", or "authPriv". +# # sec_level = "authNoPriv" +# ## Context Name. +# # context_name = "" +# ## Privacy protocol used for encrypted messages; one of "DES", "AES" or "". +# # priv_protocol = "" +# ## Privacy password used for encrypted messages. +# # priv_password = "" +# +# ## max_parallel_lookups is the maximum number of SNMP requests to +# ## make at the same time. +# # max_parallel_lookups = 100 +# +# ## ordered controls whether or not the metrics need to stay in the +# ## same order this plugin received them in. 
If false, this plugin +# ## may change the order when data is cached. If you need metrics to +# ## stay in order set this to true. keeping the metrics ordered may +# ## be slightly slower +# # ordered = false +# +# ## cache_ttl is the amount of time interface names are cached for a +# ## given agent. After this period elapses if names are needed they +# ## will be retrieved again. +# # cache_ttl = "8h" + + +# # Apply metric modifications using override semantics. +# [[processors.override]] +# ## All modifications on inputs and aggregators can be overridden: +# # name_override = "new_name" +# # name_prefix = "new_name_prefix" +# # name_suffix = "new_name_suffix" +# +# ## Tags to be added (all values must be strings) +# # [processors.override.tags] +# # additional_tag = "tag_value" + + +# # Parse a value in a specified field/tag(s) and add the result in a new metric +# [[processors.parser]] +# ## The name of the fields whose value will be parsed. +# parse_fields = [] +# +# ## If true, incoming metrics are not emitted. +# drop_original = false +# +# ## If set to override, emitted metrics will be merged by overriding the +# ## original metric using the newly parsed metrics. +# merge = "override" +# +# ## The dataformat to be read from files +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Rotate a single valued metric into a multi field metric +# [[processors.pivot]] +# ## Tag to use for naming the new field. +# tag_key = "name" +# ## Field to use as the value of the new field. 
+# value_key = "value" + + +# # Given a tag of a TCP or UDP port number, add a tag of the service name looked up in the system services file +# [[processors.port_name]] +# [[processors.port_name]] +# ## Name of tag holding the port number +# # tag = "port" +# +# ## Name of output tag where service name will be added +# # dest = "service" +# +# ## Default tcp or udp +# # default_protocol = "tcp" + + +# # Print all metrics that pass through this filter. +# [[processors.printer]] + + +# # Transforms tag and field values with regex pattern +# [[processors.regex]] +# ## Tag and field conversions defined in separate sub-tables +# # [[processors.regex.tags]] +# # ## Tag to change +# # key = "resp_code" +# # ## Regular expression to match on a tag value +# # pattern = "^(\\d)\\d\\d$" +# # ## Matches of the pattern will be replaced with this string. Use ${1} +# # ## notation to use the text of the first submatch. +# # replacement = "${1}xx" +# +# # [[processors.regex.fields]] +# # ## Field to change +# # key = "request" +# # ## All the power of the Go regular expressions available here +# # ## For example, named subgroups +# # pattern = "^/api(?P<method>/[\\w/]+)\\S*" +# # replacement = "${method}" +# # ## If result_key is present, a new field will be created +# # ## instead of changing existing field +# # result_key = "method" +# +# ## Multiple conversions may be applied for one field sequentially +# ## Let's extract one more value +# # [[processors.regex.fields]] +# # key = "request" +# # pattern = ".*category=(\\w+).*" +# # replacement = "${1}" +# # result_key = "search_category" + + +# # Rename measurements, tags, and fields that pass through this filter. +# [[processors.rename]] + + +# # ReverseDNS does a reverse lookup on IP addresses to retrieve the DNS name +# [[processors.reverse_dns]] +# ## For optimal performance, you may want to limit which metrics are passed to this +# ## processor.
eg: +# ## namepass = ["my_metric_*"] +# +# ## cache_ttl is how long the dns entries should stay cached for. +# ## generally longer is better, but if you expect a large number of diverse lookups +# ## you'll want to consider memory use. +# cache_ttl = "24h" +# +# ## lookup_timeout is how long you should wait for a single dns request to respond. +# ## this is also the maximum acceptable latency for a metric travelling through +# ## the reverse_dns processor. After lookup_timeout is exceeded, a metric will +# ## be passed on unaltered. +# ## multiple simultaneous resolution requests for the same IP will only make a +# ## single rDNS request, and they will all wait for the answer for this long. +# lookup_timeout = "3s" +# +# ## max_parallel_lookups is the maximum number of dns requests to be in flight +# ## at the same time. Requests hitting cached values do not count against this +# ## total, and neither do multiple requests for the same IP. +# ## It's probably best to keep this number fairly low. +# max_parallel_lookups = 10 +# +# ## ordered controls whether or not the metrics need to stay in the same order +# ## this plugin received them in. If false, this plugin will change the order +# ## with requests hitting cached results moving through immediately and not +# ## waiting on slower lookups. This may cause issues for you if you are +# ## depending on the order of metrics staying the same. If so, set this to true. +# ## keeping the metrics ordered may be slightly slower. +# ordered = false +# +# [[processors.reverse_dns.lookup]] +# ## get the ip from the field "source_ip", and put the result in the field "source_name" +# field = "source_ip" +# dest = "source_name" +# +# [[processors.reverse_dns.lookup]] +# ## get the ip from the tag "destination_ip", and put the result in the tag +# ## "destination_name".
+# tag = "destination_ip" +# dest = "destination_name" +# +# ## If you would prefer destination_name to be a field instead, you can use a +# ## processors.converter after this one, specifying the order attribute. + + +# # Add the S2 Cell ID as a tag based on latitude and longitude fields +# [[processors.s2geo]] +# ## The name of the lat and lon fields containing WGS-84 latitude and +# ## longitude in decimal degrees. +# # lat_field = "lat" +# # lon_field = "lon" +# +# ## New tag to create +# # tag_key = "s2_cell_id" +# +# ## Cell level (see https://s2geometry.io/resources/s2cell_statistics.html) +# # cell_level = 9 + + +# # Process metrics using a Starlark script +# [[processors.starlark]] +# ## The Starlark source can be set as a string in this configuration file, or +# ## by referencing a file containing the script. Only one source or script +# ## should be set at once. +# ## +# ## Source of the Starlark script. +# source = ''' +# def apply(metric): +# return metric +# ''' +# +# ## File containing a Starlark script. 
+# # script = "/usr/local/bin/myscript.star" + + +# # Perform string processing on tags, fields, and measurements +# [[processors.strings]] +# ## Convert a tag value to uppercase +# # [[processors.strings.uppercase]] +# # tag = "method" +# +# ## Convert a field value to lowercase and store in a new field +# # [[processors.strings.lowercase]] +# # field = "uri_stem" +# # dest = "uri_stem_normalised" +# +# ## Convert a field value to titlecase +# # [[processors.strings.titlecase]] +# # field = "status" +# +# ## Trim leading and trailing whitespace using the default cutset +# # [[processors.strings.trim]] +# # field = "message" +# +# ## Trim leading characters in cutset +# # [[processors.strings.trim_left]] +# # field = "message" +# # cutset = "\t" +# +# ## Trim trailing characters in cutset +# # [[processors.strings.trim_right]] +# # field = "message" +# # cutset = "\r\n" +# +# ## Trim the given prefix from the field +# # [[processors.strings.trim_prefix]] +# # field = "my_value" +# # prefix = "my_" +# +# ## Trim the given suffix from the field +# # [[processors.strings.trim_suffix]] +# # field = "read_count" +# # suffix = "_count" +# +# ## Replace all non-overlapping instances of old with new +# # [[processors.strings.replace]] +# # measurement = "*" +# # old = ":" +# # new = "_" +# +# ## Trims strings based on width +# # [[processors.strings.left]] +# # field = "message" +# # width = 10 +# +# ## Decode a base64 encoded utf-8 string +# # [[processors.strings.base64decode]] +# # field = "message" + + +# # Restricts the number of tags that can pass through this filter and chooses which tags to preserve when over the limit. +# [[processors.tag_limit]] +# ## Maximum number of tags to preserve +# limit = 10 +# +# ## List of tags to preferentially preserve +# keep = ["foo", "bar", "baz"] + + +# # Uses a Go template to create a new tag +# [[processors.template]] +# ## Tag to set with the output of the template. 
+# tag = "topic" +# +# ## Go template used to create the tag value. In order to ease TOML +# ## escaping requirements, you may wish to use single quotes around the +# ## template string. +# template = '{{ .Tag "hostname" }}.{{ .Tag "level" }}' + + +# # Pass through only the top (or bottom) k metrics, aggregated over a period of time. +# [[processors.topk]] +# ## How many seconds between aggregations +# # period = 10 +# +# ## How many top metrics to return +# # k = 10 +# +# ## Over which tags should the aggregation be done. Globs can be specified, in +# ## which case any tag matching the glob will be aggregated over. If set to an +# ## empty list, no aggregation over tags is done +# # group_by = ['*'] +# +# ## Over which fields the top k are calculated +# # fields = ["value"] +# +# ## What aggregation to use. Options: sum, mean, min, max +# # aggregation = "mean" +# +# ## Instead of the top k largest metrics, return the bottom k lowest metrics +# # bottomk = false +# +# ## The plugin assigns each metric a GroupBy tag generated from its name and +# ## tags. If this setting is different than "" the plugin will add a +# ## tag (which name will be the value of this setting) to each metric with +# ## the value of the calculated GroupBy tag. Useful for debugging +# # add_groupby_tag = "" +# +# ## These settings provide a way to know the position of each metric in +# ## the top k. The 'add_rank_fields' setting allows to specify for which +# ## fields the position is required. If the list is non empty, then a field +# ## will be added to each and every metric for each string present in this +# ## setting. This field will contain the ranking of the group that +# ## the metric belonged to when aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_rank' +# # add_rank_fields = [] +# +# ## These settings provide a way to know what values the plugin is generating +# ## when aggregating metrics.
The 'add_aggregate_fields' setting allows to +# ## specify for which fields the final aggregation value is required. If the +# ## list is non empty, then a field will be added to each and every metric for +# ## each field present in this setting. This field will contain +# ## the computed aggregation for the group that the metric belonged to when +# ## aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_aggregate' +# # add_aggregate_fields = [] + + +# # Rotate multi field metric into several single field metrics +# [[processors.unpivot]] +# ## Tag to use for the name. +# tag_key = "name" +# ## Field to use for the name of the value. +# value_key = "value" + + +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### + + +# # Keep the aggregate basicstats of each metric passing through. +# [[aggregators.basicstats]] +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## Configures which basic stats to push as fields +# # stats = ["count", "min", "max", "mean", "stdev", "s2", "sum"] + + +# # Report the final metric of a series +# [[aggregators.final]] +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## The time that a series is not updated until considering it final. +# series_timeout = "5m" + + +# # Create aggregate histograms. +# [[aggregators.histogram]] +# ## The period in which to flush the aggregator.
+# period = "30s" +# +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## If true, the histogram will be reset on flush instead +# ## of accumulating the results. +# reset = false +# +# ## Whether bucket values should be accumulated. If set to false, "gt" tag will be added. +# ## Defaults to true. +# cumulative = true +# +# ## Example config that aggregates all fields of the metric. +# # [[aggregators.histogram.config]] +# # ## Right borders of buckets (with +Inf implicitly added). +# # buckets = [0.0, 15.6, 34.5, 49.1, 71.5, 80.5, 94.5, 100.0] +# # ## The name of metric. +# # measurement_name = "cpu" +# +# ## Example config that aggregates only specific fields of the metric. +# # [[aggregators.histogram.config]] +# # ## Right borders of buckets (with +Inf implicitly added). +# # buckets = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] +# # ## The name of metric. +# # measurement_name = "diskio" +# # ## The concrete fields of metric +# # fields = ["io_time", "read_time", "write_time"] + + +# # Merge metrics into multifield metrics by series key +# [[aggregators.merge]] +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = true + + +# # Keep the aggregate min/max of each metric passing through. +# [[aggregators.minmax]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + +# # Count the occurrence of values in fields. +# [[aggregators.valuecounter]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. 
+# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# ## The fields for which the values will be counted +# fields = [] + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + + +# Read metrics about cpu usage +[[inputs.cpu]] + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## If true, collect raw CPU time metrics. + collect_cpu_time = false + ## If true, compute and report the sum of all non-idle CPU states. + report_active = false + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default stats will be gathered for all mount points. + ## Set mount_points will restrict the stats to only the specified mount points. + # mount_points = ["/"] + + ## Ignore mount points by filesystem type. + ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] + + +# Read metrics about disk IO by device +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + # devices = ["sda", "sdb", "vd*"] + ## Uncomment the following line if you need disk serial numbers. + # skip_serial_number = false + # + ## On systems which support it, device metadata can be added in the form of + ## tags. + ## Currently only Linux is supported via udev properties. You can view + ## available properties for a device by running: + ## 'udevadm info -q property -n /dev/sda' + ## Note: Most, but not all, udev properties can be accessed this way. Properties + ## that are currently inaccessible include DEVTYPE, DEVNAME, and DEVPATH. 
+ # device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"] + # + ## Using the same metadata source as device_tags, you can also customize the + ## name of the device via templates. + ## The 'name_templates' parameter is a list of templates to try and apply to + ## the device. The template may contain variables in the form of '$PROPERTY' or + ## '${PROPERTY}'. The first template which does not contain any variables not + ## present for the device is used as the device name tag. + ## The typical use case is for LVM volumes, to get the VG/LV name instead of + ## the near-meaningless DM-0 name. + # name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"] + + +# Get kernel statistics from /proc/stat +[[inputs.kernel]] + # no configuration + + +# Read metrics about memory usage +[[inputs.mem]] + # no configuration + + +# Get the number of processes and group them by status +[[inputs.processes]] + # no configuration + + +# Read metrics about swap memory usage +[[inputs.swap]] + # no configuration + + +# Read metrics about system load & uptime +[[inputs.system]] + ## Uncomment to remove deprecated metrics. + # fielddrop = ["uptime_format"] + + +# # Gather ActiveMQ metrics +# [[inputs.activemq]] +# ## ActiveMQ WebConsole URL +# url = "http://127.0.0.1:8161" +# +# ## Required ActiveMQ Endpoint +# ## deprecated in 1.11; use the url option +# # server = "127.0.0.1" +# # port = 8161 +# +# ## Credentials for basic HTTP authentication +# # username = "admin" +# # password = "admin" +# +# ## Required ActiveMQ webadmin root path +# # webadmin = "admin" +# +# ## Maximum time to receive response. 
+# # response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read stats from aerospike server(s) +# [[inputs.aerospike]] +# ## Aerospike servers to connect to (with port) +# ## This plugin will query all namespaces the aerospike +# ## server has configured and get stats for them. +# servers = ["localhost:3000"] +# +# # username = "telegraf" +# # password = "pa$$word" +# +# ## Optional TLS Config +# # enable_tls = false +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## If false, skip chain & host verification +# # insecure_skip_verify = true + + +# # Read Apache status information (mod_status) +# [[inputs.apache]] +# ## An array of URLs to gather from, must be directed at the machine +# ## readable version of the mod_status page including the auto query string. +# ## Default is "http://localhost/server-status?auto". +# urls = ["http://localhost/server-status?auto"] +# +# ## Credentials for basic HTTP authentication. +# # username = "myuser" +# # password = "mypassword" +# +# ## Maximum time to receive response. +# # response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Monitor APC UPSes connected to apcupsd +# [[inputs.apcupsd]] +# # A list of running apcupsd server to connect to. +# # If not provided will default to tcp://127.0.0.1:3551 +# servers = ["tcp://127.0.0.1:3551"] +# +# ## Timeout for dialing server. 
+# timeout = "5s" + + +# # Gather metrics from Apache Aurora schedulers +# [[inputs.aurora]] +# ## Schedulers are the base addresses of your Aurora Schedulers +# schedulers = ["http://127.0.0.1:8081"] +# +# ## Set of role types to collect metrics from. +# ## +# ## The scheduler roles are checked each interval by contacting the +# ## scheduler nodes; zookeeper is not contacted. +# # roles = ["leader", "follower"] +# +# ## Timeout is the max time for total network operations. +# # timeout = "5s" +# +# ## Username and password are sent using HTTP Basic Auth. +# # username = "username" +# # password = "pa$$word" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Gather Azure Storage Queue metrics +# [[inputs.azure_storage_queue]] +# ## Required Azure Storage Account name +# account_name = "mystorageaccount" +# +# ## Required Azure Storage Account access key +# account_key = "storageaccountaccesskey" +# +# ## Set to false to disable peeking age of oldest message (executes faster) +# # peek_oldest_message_age = true + + +# # Read metrics of bcache from stats_total and dirty_data +# [[inputs.bcache]] +# ## Bcache sets path +# ## If not specified, then default is: +# bcachePath = "/sys/fs/bcache" +# +# ## By default, telegraf gather stats for all bcache devices +# ## Setting devices will restrict the stats to the specified +# ## bcache devices. +# bcacheDevs = ["bcache0"] + + +# # Collects Beanstalkd server and tubes stats +# [[inputs.beanstalkd]] +# ## Server to collect data from +# server = "localhost:11300" +# +# ## List of tubes to gather stats about. 
+# ## If no tubes specified then data gathered for each tube on server reported by list-tubes command +# tubes = ["notifications"] + + +# # Read BIND nameserver XML statistics +# [[inputs.bind]] +# ## An array of BIND XML statistics URI to gather stats. +# ## Default is "http://localhost:8053/xml/v3". +# # urls = ["http://localhost:8053/xml/v3"] +# # gather_memory_contexts = false +# # gather_views = false + + +# # Collect bond interface status, slaves statuses and failures count +# [[inputs.bond]] +# ## Sets 'proc' directory path +# ## If not specified, then default is /proc +# # host_proc = "/proc" +# +# ## By default, telegraf gather stats for all bond interfaces +# ## Setting interfaces will restrict the stats to the specified +# ## bond interfaces. +# # bond_interfaces = ["bond0"] + + +# # Collect Kafka topics and consumers status from Burrow HTTP API. +# [[inputs.burrow]] +# ## Burrow API endpoints in format "schema://host:port". +# ## Default is "http://localhost:8000". +# servers = ["http://localhost:8000"] +# +# ## Override Burrow API prefix. +# ## Useful when Burrow is behind reverse-proxy. +# # api_prefix = "/v3/kafka" +# +# ## Maximum time to receive response. +# # response_timeout = "5s" +# +# ## Limit per-server concurrent connections. +# ## Useful in case of large number of topics or consumer groups. +# # concurrent_connections = 20 +# +# ## Filter clusters, default is no filtering. +# ## Values can be specified as glob patterns. +# # clusters_include = [] +# # clusters_exclude = [] +# +# ## Filter consumer groups, default is no filtering. +# ## Values can be specified as glob patterns. +# # groups_include = [] +# # groups_exclude = [] +# +# ## Filter topics, default is no filtering. +# ## Values can be specified as glob patterns. +# # topics_include = [] +# # topics_exclude = [] +# +# ## Credentials for basic HTTP authentication. 
+# # username = "" +# # password = "" +# +# ## Optional SSL config +# # ssl_ca = "/etc/telegraf/ca.pem" +# # ssl_cert = "/etc/telegraf/cert.pem" +# # ssl_key = "/etc/telegraf/key.pem" +# # insecure_skip_verify = false + + +# # Collects performance metrics from the MON, OSD, MDS and RGW nodes in a Ceph storage cluster. +# [[inputs.ceph]] +# ## This is the recommended interval to poll. Too frequent and you will lose +# ## data points due to timeouts during rebalancing and recovery +# interval = '1m' +# +# ## All configuration values are optional, defaults are shown below +# +# ## location of ceph binary +# ceph_binary = "/usr/bin/ceph" +# +# ## directory in which to look for socket files +# socket_dir = "/var/run/ceph" +# +# ## prefix of MON and OSD socket files, used to determine socket type +# mon_prefix = "ceph-mon" +# osd_prefix = "ceph-osd" +# mds_prefix = "ceph-mds" +# rgw_prefix = "ceph-client" +# +# ## suffix used to identify socket files +# socket_suffix = "asok" +# +# ## Ceph user to authenticate as +# ceph_user = "client.admin" +# +# ## Ceph configuration to use to locate the cluster +# ceph_config = "/etc/ceph/ceph.conf" +# +# ## Whether to gather statistics via the admin socket +# gather_admin_socket_stats = true +# +# ## Whether to gather statistics via ceph commands +# gather_cluster_stats = false + + +# # Read specific statistics per cgroup +# [[inputs.cgroup]] +# ## Directories in which to look for files, globs are supported. +# ## Consider restricting paths to the set of cgroups you really +# ## want to monitor if you have a large number of cgroups, to avoid +# ## any cardinality issues. +# # paths = [ +# # "/sys/fs/cgroup/memory", +# # "/sys/fs/cgroup/memory/child1", +# # "/sys/fs/cgroup/memory/child2/*", +# # ] +# ## cgroup stat fields, as file names, globs are supported. +# ## these file names are appended to each path from above. 
+# # files = ["memory.*usage*", "memory.limit_in_bytes"] + + +# # Get standard chrony metrics, requires chronyc executable. +# [[inputs.chrony]] +# ## If true, chronyc tries to perform a DNS lookup for the time server. +# # dns_lookup = false + + +# # Pull Metric Statistics from Amazon CloudWatch +# [[inputs.cloudwatch]] +# ## Amazon Region +# region = "us-east-1" +# +# ## Amazon Credentials +# ## Credentials are loaded in the following order +# ## 1) Assumed credentials via STS if role_arn is specified +# ## 2) explicit credentials from 'access_key' and 'secret_key' +# ## 3) shared profile from 'profile' +# ## 4) environment variables +# ## 5) shared credentials file +# ## 6) EC2 Instance Profile +# # access_key = "" +# # secret_key = "" +# # token = "" +# # role_arn = "" +# # profile = "" +# # shared_credential_file = "" +# +# ## Endpoint to make request against, the correct endpoint is automatically +# ## determined and this option should only be set if you wish to override the +# ## default. +# ## ex: endpoint_url = "http://localhost:8000" +# # endpoint_url = "" +# +# # The minimum period for Cloudwatch metrics is 1 minute (60s). However not all +# # metrics are made available to the 1 minute period. Some are collected at +# # 3 minute, 5 minute, or larger intervals. See https://aws.amazon.com/cloudwatch/faqs/#monitoring. +# # Note that if a period is configured that is smaller than the minimum for a +# # particular metric, that metric will not be returned by the Cloudwatch API +# # and will not be collected by Telegraf. +# # +# ## Requested CloudWatch aggregation Period (required - must be a multiple of 60s) +# period = "5m" +# +# ## Collection Delay (required - must account for metrics availability via CloudWatch API) +# delay = "5m" +# +# ## Recommended: use metric 'interval' that is a multiple of 'period' to avoid +# ## gaps or overlap in pulled data +# interval = "5m" +# +# ## Configure the TTL for the internal cache of metrics. 
+# # cache_ttl = "1h" +# +# ## Metric Statistic Namespace (required) +# namespace = "AWS/ELB" +# +# ## Maximum requests per second. Note that the global default AWS rate limit is +# ## 50 reqs/sec, so if you define multiple namespaces, these should add up to a +# ## maximum of 50. +# ## See http://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_limits.html +# # ratelimit = 25 +# +# ## Timeout for http requests made by the cloudwatch client. +# # timeout = "5s" +# +# ## Namespace-wide statistic filters. These allow fewer queries to be made to +# ## cloudwatch. +# # statistic_include = [ "average", "sum", "minimum", "maximum", "sample_count" ] +# # statistic_exclude = [] +# +# ## Metrics to Pull +# ## Defaults to all Metrics in Namespace if nothing is provided +# ## Refreshes Namespace available metrics every 1h +# #[[inputs.cloudwatch.metrics]] +# # names = ["Latency", "RequestCount"] +# # +# # ## Statistic filters for Metric. These allow for retrieving specific +# # ## statistics for an individual metric. +# # # statistic_include = [ "average", "sum", "minimum", "maximum", "sample_count" ] +# # # statistic_exclude = [] +# # +# # ## Dimension filters for Metric. All dimensions defined for the metric names +# # ## must be specified in order to retrieve the metric statistics. +# # [[inputs.cloudwatch.metrics.dimensions]] +# # name = "LoadBalancerName" +# # value = "p-example" + + +# # Collects conntrack stats from the configured directories and files. +# [[inputs.conntrack]] +# ## The following defaults would work with multiple versions of conntrack. +# ## Note the nf_ and ip_ filename prefixes are mutually exclusive across +# ## kernel versions, as are the directory locations. +# +# ## Superset of filenames to look for within the conntrack dirs. +# ## Missing files will be ignored. +# files = ["ip_conntrack_count","ip_conntrack_max", +# "nf_conntrack_count","nf_conntrack_max"] +# +# ## Directories to search within for the conntrack files above.
+# ## Missing directories will be ignored. +# dirs = ["/proc/sys/net/ipv4/netfilter","/proc/sys/net/netfilter"] + + +# # Gather health check statuses from services registered in Consul +# [[inputs.consul]] +# ## Consul server address +# # address = "localhost:8500" +# +# ## URI scheme for the Consul server, one of "http", "https" +# # scheme = "http" +# +# ## ACL token used in every request +# # token = "" +# +# ## HTTP Basic Authentication username and password. +# # username = "" +# # password = "" +# +# ## Data center to query the health checks from +# # datacenter = "" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = true +# +# ## Consul checks' tag splitting +# # When tags are formatted like "key:value" with ":" as a delimiter then +# # they will be split and reported as proper key:value in Telegraf +# # tag_delimiter = ":" + + +# # Read metrics from one or many couchbase clusters +# [[inputs.couchbase]] +# ## specify servers via a url matching: +# ## [protocol://][:password]@address[:port] +# ## e.g. +# ## http://couchbase-0.example.com/ +# ## http://admin:secret@couchbase-0.example.com:8091/ +# ## +# ## If no servers are specified, then localhost is used as the host. +# ## If no protocol is specified, HTTP is used. +# ## If no port is specified, 8091 is used. +# servers = ["http://localhost:8091"] + + +# # Read CouchDB Stats from one or more servers +# [[inputs.couchdb]] +# ## Works with CouchDB stats endpoints out of the box +# ## Multiple Hosts from which to read CouchDB stats: +# hosts = ["http://localhost:8086/_stats"] +# +# ## Use HTTP Basic Authentication. +# # basic_username = "telegraf" +# # basic_password = "p@ssw0rd" + + +# # Input plugin for DC/OS metrics +# [[inputs.dcos]] +# ## The DC/OS cluster URL.
+# cluster_url = "https://dcos-ee-master-1" +# +# ## The ID of the service account. +# service_account_id = "telegraf" +# ## The private key file for the service account. +# service_account_private_key = "/etc/telegraf/telegraf-sa-key.pem" +# +# ## Path containing login token. If set, will read on every gather. +# # token_file = "/home/dcos/.dcos/token" +# +# ## In all filter options if both include and exclude are empty all items +# ## will be collected. Arrays may contain glob patterns. +# ## +# ## Node IDs to collect metrics from. If a node is excluded, no metrics will +# ## be collected for its containers or apps. +# # node_include = [] +# # node_exclude = [] +# ## Container IDs to collect container metrics from. +# # container_include = [] +# # container_exclude = [] +# ## Container IDs to collect app metrics from. +# # app_include = [] +# # app_exclude = [] +# +# ## Maximum concurrent connections to the cluster. +# # max_connections = 10 +# ## Maximum time to receive a response from cluster. +# # response_timeout = "20s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## If false, skip chain & host verification +# # insecure_skip_verify = true +# +# ## Recommended filtering to reduce series cardinality. +# # [inputs.dcos.tagdrop] +# # path = ["/var/lib/mesos/slave/slaves/*"] + + +# # Read metrics from one or many disque servers +# [[inputs.disque]] +# ## An array of URI to gather stats about. Specify an ip or hostname +# ## with optional port and password. +# ## ie disque://localhost, disque://10.10.3.33:18832, 10.0.0.1:10000, etc. +# ## If no servers are specified, then localhost is used as the host. 
+# servers = ["localhost"] + + +# # Provide a native collection for dmsetup based statistics for dm-cache +# [[inputs.dmcache]] +# ## Whether to report per-device stats or not +# per_device = true + + +# # Query given DNS server and gives statistics +# [[inputs.dns_query]] +# ## servers to query +# servers = ["8.8.8.8"] +# +# ## Network is the network protocol name. +# # network = "udp" +# +# ## Domains or subdomains to query. +# # domains = ["."] +# +# ## Query record type. +# ## Possible values: A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV. +# # record_type = "A" +# +# ## Dns server port. +# # port = 53 +# +# ## Query timeout in seconds. +# # timeout = 2 + + +# # Read metrics about docker containers +# [[inputs.docker]] +# ## Docker Endpoint +# ## To use TCP, set endpoint = "tcp://[ip]:[port]" +# ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# endpoint = "unix:///var/run/docker.sock" +# +# ## Set to true to collect Swarm metrics(desired_replicas, running_replicas) +# gather_services = false +# +# ## Only collect metrics for these containers, collect all if empty +# container_names = [] +# +# ## Set the source tag for the metrics to the container ID hostname, eg first 12 chars +# source_tag = false +# +# ## Containers to include and exclude. Globs accepted. +# ## Note that an empty array for both will include all containers +# container_name_include = [] +# container_name_exclude = [] +# +# ## Container states to include and exclude. Globs accepted. +# ## When empty only containers in the "running" state will be captured. 
+# ## example: container_state_include = ["created", "restarting", "running", "removing", "paused", "exited", "dead"] +# ## example: container_state_exclude = ["created", "restarting", "running", "removing", "paused", "exited", "dead"] +# # container_state_include = [] +# # container_state_exclude = [] +# +# ## Timeout for docker list, info, and stats commands +# timeout = "5s" +# +# ## Whether to report for each container per-device blkio (8:0, 8:1...) and +# ## network (eth0, eth1, ...) stats or not +# perdevice = true +# +# ## Whether to report for each container total blkio and network stats or not +# total = false +# +# ## Which environment variables should we use as a tag +# ##tag_env = ["JAVA_HOME", "HEAP_SIZE"] +# +# ## docker labels to include and exclude as tags. Globs accepted. +# ## Note that an empty array for both will include all labels as tags +# docker_label_include = [] +# docker_label_exclude = [] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read statistics from one or many dovecot servers +# [[inputs.dovecot]] +# ## specify dovecot servers via an address:port list +# ## e.g. +# ## localhost:24242 +# ## +# ## If no servers are specified, then localhost is used as the host. +# servers = ["localhost:24242"] +# +# ## Type is one of "user", "domain", "ip", or "global" +# type = "global" +# +# ## Wildcard matches like "*.com". An empty string "" is same as "*" +# ## If type = "ip" filters should be +# filters = [""] + + +# # Read metrics about docker containers from Fargate/ECS v2, v3 meta endpoints. +# [[inputs.ecs]] +# ## ECS metadata url. +# ## Metadata v2 API is used if set explicitly. Otherwise, +# ## v3 metadata endpoint API is used if available. +# # endpoint_url = "" +# +# ## Containers to include and exclude. Globs accepted. 
+# ## Note that an empty array for both will include all containers +# # container_name_include = [] +# # container_name_exclude = [] +# +# ## Container states to include and exclude. Globs accepted. +# ## When empty only containers in the "RUNNING" state will be captured. +# ## Possible values are "NONE", "PULLED", "CREATED", "RUNNING", +# ## "RESOURCES_PROVISIONED", "STOPPED". +# # container_status_include = [] +# # container_status_exclude = [] +# +# ## ecs labels to include and exclude as tags. Globs accepted. +# ## Note that an empty array for both will include all labels as tags +# ecs_label_include = [ "com.amazonaws.ecs.*" ] +# ecs_label_exclude = [] +# +# ## Timeout for queries. +# # timeout = "5s" + + +# # Read stats from one or more Elasticsearch servers or clusters +# [[inputs.elasticsearch]] +# ## specify a list of one or more Elasticsearch servers +# # you can add username and password to your url to use basic authentication: +# # servers = ["http://user:pass@localhost:9200"] +# servers = ["http://localhost:9200"] +# +# ## Timeout for HTTP requests to the elastic search server(s) +# http_timeout = "5s" +# +# ## When local is true (the default), the node will read only its own stats. +# ## Set local to false when you want to read the node stats from all nodes +# ## of the cluster. +# local = true +# +# ## Set cluster_health to true when you want to also obtain cluster health stats +# cluster_health = false +# +# ## Adjust cluster_health_level when you want to also obtain detailed health stats +# ## The options are +# ## - indices (default) +# ## - cluster +# # cluster_health_level = "indices" +# +# ## Set cluster_stats to true when you want to also obtain cluster stats. +# cluster_stats = false +# +# ## Only gather cluster_stats from the master node. 
For this to work, local = true is required +# cluster_stats_only_from_master = true +# +# ## Indices to collect; can be one or more indices names or _all +# indices_include = ["_all"] +# +# ## One of "shards", "cluster", "indices" +# indices_level = "shards" +# +# ## node_stats is a list of sub-stats that you want to have gathered. Valid options +# ## are "indices", "os", "process", "jvm", "thread_pool", "fs", "transport", "http", +# ## "breaker". Per default, all stats are gathered. +# # node_stats = ["jvm", "http"] +# +# ## HTTP Basic Authentication username and password. +# # username = "" +# # password = "" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Returns ethtool statistics for given interfaces +[[inputs.ethtool]] +# ## List of interfaces to pull metrics for +# # interface_include = ["eth0"] +# +# ## List of interfaces to ignore when pulling metrics. +# # interface_exclude = ["eth1"] + + +# # Read metrics from one or more commands that can output to stdout +# [[inputs.exec]] +# ## Commands array +# commands = [ +# "/tmp/test.sh", +# "/usr/bin/mycollector --foo=bar", +# "/tmp/collect_*.sh" +# ] +# +# ## Timeout for each command to complete. +# timeout = "5s" +# +# ## measurement name suffix (for separating different commands) +# name_suffix = "_mycollector" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read metrics from fail2ban. +# [[inputs.fail2ban]] +# ## Use sudo to run fail2ban-client +# use_sudo = false + + +# # Read devices value(s) from a Fibaro controller +# [[inputs.fibaro]] +# ## Required Fibaro controller address/hostname.
+# ## Note: at the time of writing this plugin, Fibaro only implemented http - no https available +# url = "http://:80" +# +# ## Required credentials to access the API (http://) +# username = "" +# password = "" +# +# ## Amount of time allowed to complete the HTTP request +# # timeout = "5s" + + +# # Parse a complete file each interval +# [[inputs.file]] +# ## Files to parse each interval. Accept standard unix glob matching rules, +# ## as well as ** to match recursive files and directories. +# files = ["/tmp/metrics.out"] +# +# ## Name a tag containing the name of the file the data was parsed from. Leave empty +# ## to disable. +# # file_tag = "" +# +# ## Character encoding to use when interpreting the file contents. Invalid +# ## characters are replaced using the unicode replacement character. When set +# ## to the empty string the data is not decoded to text. +# ## ex: character_encoding = "utf-8" +# ## character_encoding = "utf-16le" +# ## character_encoding = "utf-16be" +# ## character_encoding = "" +# # character_encoding = "" +# +# ## The dataformat to be read from files +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Count files in a directory +# [[inputs.filecount]] +# ## Directory to gather stats about. +# ## deprecated in 1.9; use the directories option +# # directory = "/var/cache/apt/archives" +# +# ## Directories to gather stats about. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk".
ie: +# ## /var/log/** -> recursively find all directories in /var/log and count files in each directories +# ## /var/log/*/* -> find all directories with a parent dir in /var/log and count files in each directories +# ## /var/log -> count all files in /var/log and all of its subdirectories +# directories = ["/var/cache/apt/archives"] +# +# ## Only count files that match the name pattern. Defaults to "*". +# name = "*.deb" +# +# ## Count files in subdirectories. Defaults to true. +# recursive = false +# +# ## Only count regular files. Defaults to true. +# regular_only = true +# +# ## Follow all symlinks while walking the directory tree. Defaults to false. +# follow_symlinks = false +# +# ## Only count files that are at least this size. If size is +# ## a negative number, only count files that are smaller than the +# ## absolute value of size. Acceptable units are B, KiB, MiB, KB, ... +# ## Without quotes and units, interpreted as size in bytes. +# size = "0B" +# +# ## Only count files that have not been touched for at least this +# ## duration. If mtime is negative, only count files that have been +# ## touched in this duration. Defaults to "0s". +# mtime = "0s" + + +# # Read stats about given file(s) +# [[inputs.filestat]] +# ## Files to gather stats about. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## "/var/log/**.log" -> recursively find all .log files in /var/log +# ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log +# ## "/var/log/apache.log" -> just tail the apache log file +# ## +# ## See https://github.com/gobwas/glob for more examples +# ## +# files = ["/var/log/**.log"] +# +# ## If true, read the entire file and calculate an md5 checksum. 
+# md5 = false + + +# # Read real time temps from fireboard.io servers +# [[inputs.fireboard]] +# ## Specify auth token for your account +# auth_token = "invalidAuthToken" +# ## You can override the fireboard server URL if necessary +# # url = https://fireboard.io/api/v1/devices.json +# ## You can set a different http_timeout if you need to +# ## You should set a string using an number and time indicator +# ## for example "12s" for 12 seconds. +# # http_timeout = "4s" + + +# # Read metrics exposed by fluentd in_monitor plugin +# [[inputs.fluentd]] +# ## This plugin reads information exposed by fluentd (using /api/plugins.json endpoint). +# ## +# ## Endpoint: +# ## - only one URI is allowed +# ## - https is not supported +# endpoint = "http://localhost:24220/api/plugins.json" +# +# ## Define which plugins have to be excluded (based on "type" field - e.g. monitor_agent) +# exclude = [ +# "monitor_agent", +# "dummy", +# ] + + +# # Gather repository information from GitHub hosted repositories. +# [[inputs.github]] +# ## List of repositories to monitor. +# repositories = [ +# "influxdata/telegraf", +# "influxdata/influxdb" +# ] +# +# ## Github API access token. Unauthenticated requests are limited to 60 per hour. +# # access_token = "" +# +# ## Github API enterprise url. Github Enterprise accounts must specify their base url. +# # enterprise_base_url = "" +# +# ## Timeout for HTTP requests. +# # http_timeout = "5s" + + +# # Read flattened metrics from one or more GrayLog HTTP endpoints +# [[inputs.graylog]] +# ## API endpoint, currently supported API: +# ## +# ## - multiple (Ex http://:12900/system/metrics/multiple) +# ## - namespace (Ex http://:12900/system/metrics/namespace/{namespace}) +# ## +# ## For namespace endpoint, the metrics array will be ignored for that call. +# ## Endpoint can contain namespace and multiple type calls. 
+# ## +# ## Please check http://[graylog-server-ip]:12900/api-browser for full list +# ## of endpoints +# servers = [ +# "http://[graylog-server-ip]:12900/system/metrics/multiple", +# ] +# +# ## Metrics list +# ## List of metrics can be found on Graylog webservice documentation. +# ## Or by hitting the web service API at: +# ## http://[graylog-host]:12900/system/metrics +# metrics = [ +# "jvm.cl.loaded", +# "jvm.memory.pools.Metaspace.committed" +# ] +# +# ## Username and password +# username = "" +# password = "" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics of haproxy, via socket or csv stats page +# [[inputs.haproxy]] +# ## An array of addresses to gather stats about. Specify an ip or hostname +# ## with optional port. ie localhost, 10.10.3.33:1936, etc. +# ## Make sure you specify the complete path to the stats endpoint +# ## including the protocol, ie http://10.10.3.33:1936/haproxy?stats +# +# ## If no servers are specified, then default to 127.0.0.1:1936/haproxy?stats +# servers = ["http://myhaproxy.com:1936/haproxy?stats"] +# +# ## Credentials for basic HTTP authentication +# # username = "admin" +# # password = "admin" +# +# ## You can also use local socket with standard wildcard globbing. +# ## Server address not starting with 'http' will be treated as a possible +# ## socket, so both examples below are valid. +# # servers = ["socket:/run/haproxy/admin.sock", "/run/haproxy/*.sock"] +# +# ## By default, some of the fields are renamed from what haproxy calls them. +# ## Setting this option to true results in the plugin keeping the original +# ## field names. 
+# # keep_field_names = false +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Monitor disks' temperatures using hddtemp +# [[inputs.hddtemp]] +# ## By default, telegraf gathers temps data from all disks detected by the +# ## hddtemp. +# ## +# ## Only collect temps from the selected disks. +# ## +# ## A * as the device name will return the temperature values of all disks. +# ## +# # address = "127.0.0.1:7634" +# # devices = ["sda", "*"] + + +# # Read formatted metrics from one or more HTTP endpoints +# [[inputs.http]] +# ## One or more URLs from which to read formatted metrics +# urls = [ +# "http://localhost/metrics" +# ] +# +# ## HTTP method +# # method = "GET" +# +# ## Optional HTTP headers +# # headers = {"X-Special-Header" = "Special-Value"} +# +# ## Optional file with Bearer token +# ## file content is added as an Authorization header +# # bearer_token = "/path/to/file" +# +# ## Optional HTTP Basic Auth Credentials +# # username = "username" +# # password = "pa$$word" +# +# ## HTTP entity-body to send with POST/PUT requests. +# # body = "" +# +# ## HTTP Content-Encoding for write request body, can be set to "gzip" to +# ## compress body or "identity" to apply no encoding. +# # content_encoding = "identity" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Amount of time allowed to complete the HTTP request +# # timeout = "5s" +# +# ## List of success status codes +# # success_status_codes = [200] +# +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# # data_format = "influx" + + +# # HTTP/HTTPS request given an address, a method and a timeout +# [[inputs.http_response]] +# ## Deprecated in 1.12, use 'urls' +# ## Server address (default http://localhost) +# # address = "http://localhost" +# +# ## List of urls to query. +# # urls = ["http://localhost"] +# +# ## Set http_proxy (telegraf uses the system wide proxy settings if it is not set) +# # http_proxy = "http://localhost:8888" +# +# ## Set response_timeout (default 5 seconds) +# # response_timeout = "5s" +# +# ## HTTP Request Method +# # method = "GET" +# +# ## Whether to follow redirects from the server (defaults to false) +# # follow_redirects = false +# +# ## Optional file with Bearer token +# ## file content is added as an Authorization header +# # bearer_token = "/path/to/file" +# +# ## Optional HTTP Basic Auth Credentials +# # username = "username" +# # password = "pa$$word" +# +# ## Optional HTTP Request Body +# # body = ''' +# # {'fake':'data'} +# # ''' +# +# ## Optional name of the field that will contain the body of the response. +# ## By default it is set to an empty String indicating that the body's content won't be added +# # response_body_field = '' +# +# ## Maximum allowed HTTP response body size in bytes. +# ## 0 means to use the default of 32MiB. 
+# ## If the response body size exceeds this limit a "body_read_error" will be raised +# # response_body_max_size = "32MiB" +# +# ## Optional substring or regex match in body of the response (case sensitive) +# # response_string_match = "\"service_status\": \"up\"" +# # response_string_match = "ok" +# # response_string_match = "\".*_status\".?:.?\"up\"" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## HTTP Request Headers (all values must be strings) +# # [inputs.http_response.headers] +# # Host = "github.com" +# +# ## Optional setting to map response http headers into tags +# ## If the http header is not present on the request, no corresponding tag will be added +# ## If multiple instances of the http header are present, only the first value will be used +# # http_header_tags = {"HTTP_HEADER" = "TAG_NAME"} +# +# ## Interface to use when dialing an address +# # interface = "eth0" + + +# # Read flattened metrics from one or more JSON HTTP endpoints +# [[inputs.httpjson]] +# ## NOTE This plugin only reads numerical measurements, strings and booleans +# ## will be ignored. +# +# ## Name for the service being polled. Will be appended to the name of the +# ## measurement e.g. httpjson_webserver_stats +# ## +# ## Deprecated (1.3.0): Use name_override, name_suffix, name_prefix instead. 
+# name = "webserver_stats" +# +# ## URL of each server in the service's cluster +# servers = [ +# "http://localhost:9999/stats/", +# "http://localhost:9998/stats/", +# ] +# ## Set response_timeout (default 5 seconds) +# response_timeout = "5s" +# +# ## HTTP method to use: GET or POST (case-sensitive) +# method = "GET" +# +# ## List of tag names to extract from top-level of JSON server response +# # tag_keys = [ +# # "my_tag_1", +# # "my_tag_2" +# # ] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## HTTP parameters (all values must be strings). For "GET" requests, data +# ## will be included in the query. For "POST" requests, data will be included +# ## in the request body as "x-www-form-urlencoded". +# # [inputs.httpjson.parameters] +# # event_type = "cpu_spike" +# # threshold = "0.75" +# +# ## HTTP Headers (all values must be strings) +# # [inputs.httpjson.headers] +# # X-Auth-Token = "my-xauth-token" +# # apiVersion = "v1" + + +# # Gather Icinga2 status +# [[inputs.icinga2]] +# ## Required Icinga2 server address +# # server = "https://localhost:5665" +# +# ## Required Icinga2 object type ("services" or "hosts") +# # object_type = "services" +# +# ## Credentials for basic HTTP authentication +# # username = "admin" +# # password = "admin" +# +# ## Maximum time to receive response. 
+# # response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = true + + +# # Gets counters from all InfiniBand cards and ports installed +# [[inputs.infiniband]] +# # no configuration + + +# # Read InfluxDB-formatted JSON metrics from one or more HTTP endpoints +# [[inputs.influxdb]] +# ## Works with InfluxDB debug endpoints out of the box, +# ## but other services can use this format too. +# ## See the influxdb plugin's README for more details. +# +# ## Multiple URLs from which to read InfluxDB-formatted JSON +# ## Default is "http://localhost:8086/debug/vars". +# urls = [ +# "http://localhost:8086/debug/vars" +# ] +# +# ## Username and password to send using HTTP Basic Authentication. +# # username = "" +# # password = "" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## http request & header timeout +# timeout = "5s" + + +# # Collect statistics about itself +# [[inputs.internal]] +# ## If true, collect telegraf memory stats. +# # collect_memstats = true + + +# # This plugin gathers interrupts data from /proc/interrupts and /proc/softirqs. +# [[inputs.interrupts]] +# ## When set to true, cpu metrics are tagged with the cpu. Otherwise cpu is +# ## stored as a field. +# ## +# ## The default is false for backwards compatibility, and will be changed to +# ## true in a future version. It is recommended to set to true on new +# ## deployments. +# # cpu_as_tag = false +# +# ## To filter which IRQs to collect, make use of tagpass / tagdrop, i.e. 
+# # [inputs.interrupts.tagdrop] +# # irq = [ "NET_RX", "TASKLET" ] + + +# # Read metrics from the bare metal servers via IPMI +# [[inputs.ipmi_sensor]] +# ## optionally specify the path to the ipmitool executable +# # path = "/usr/bin/ipmitool" +# ## +# ## Setting 'use_sudo' to true will make use of sudo to run ipmitool. +# ## Sudo must be configured to allow the telegraf user to run ipmitool +# ## without a password. +# # use_sudo = false +# ## +# ## optionally force session privilege level. Can be CALLBACK, USER, OPERATOR, ADMINISTRATOR +# # privilege = "ADMINISTRATOR" +# ## +# ## optionally specify one or more servers via a url matching +# ## [username[:password]@][protocol[(address)]] +# ## e.g. +# ## root:passwd@lan(127.0.0.1) +# ## +# ## if no servers are specified, local machine sensor stats will be queried +# ## +# # servers = ["USERID:PASSW0RD@lan(192.168.1.1)"] +# +# ## Recommended: use metric 'interval' that is a multiple of 'timeout' to avoid +# ## gaps or overlap in pulled data +# interval = "30s" +# +# ## Timeout for the ipmitool command to complete +# timeout = "20s" +# +# ## Schema Version: (Optional, defaults to version 1) +# metric_version = 2 + + +# # Gather packets and bytes counters from Linux ipsets +# [[inputs.ipset]] +# ## By default, we only show sets which have already matched at least 1 packet. +# ## set include_unmatched_sets = true to gather them all. +# include_unmatched_sets = false +# ## Adjust your sudo settings appropriately if using this option ("sudo ipset save") +# use_sudo = false +# ## The default timeout of 1s for ipset execution can be overridden here: +# # timeout = "1s" + + +# # Gather packets and bytes throughput from iptables +# [[inputs.iptables]] +# ## iptables require root access on most systems. +# ## Setting 'use_sudo' to true will make use of sudo to run iptables. +# ## Users must configure sudo to allow telegraf user to run iptables with no password. 
+# ## iptables can be restricted to only list command "iptables -nvL". +# use_sudo = false +# ## Setting 'use_lock' to true runs iptables with the "-w" option. +# ## Adjust your sudo settings appropriately if using this option ("iptables -w 5 -nvl") +# use_lock = false +# ## Define an alternate executable, such as "ip6tables". Default is "iptables". +# # binary = "ip6tables" +# ## defines the table to monitor: +# table = "filter" +# ## defines the chains to monitor. +# ## NOTE: iptables rules without a comment will not be monitored. +# ## Read the plugin documentation for more information. +# chains = [ "INPUT" ] + + +# # Collect virtual and real server stats from Linux IPVS +# [[inputs.ipvs]] +# # no configuration + + +# # Read jobs and cluster metrics from Jenkins instances +# [[inputs.jenkins]] +# ## The Jenkins URL in the format "schema://host:port" +# url = "http://my-jenkins-instance:8080" +# # username = "admin" +# # password = "admin" +# +# ## Set response_timeout +# response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use SSL but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Optional Max Job Build Age filter +# ## Default 1 hour, ignore builds older than max_build_age +# # max_build_age = "1h" +# +# ## Optional Sub Job Depth filter +# ## Jenkins can have unlimited layer of sub jobs +# ## This config will limit the layers of pulling, default value 0 means +# ## unlimited pulling until no more sub jobs +# # max_subjob_depth = 0 +# +# ## Optional Sub Job Per Layer +# ## In workflow-multibranch-plugin, each branch will be created as a sub job. 
+# ## This config limits calls to only the latest branches in each layer, +# ## empty will use default value 10 +# # max_subjob_per_layer = 10 +# +# ## Jobs to exclude from gathering +# # job_exclude = [ "job1", "job2/subjob1/subjob2", "job3/*"] +# +# ## Nodes to exclude from gathering +# # node_exclude = [ "node1", "node2" ] +# +# ## Worker pool for jenkins plugin only +# ## Leaving this field empty will use the default value 5 +# # max_connections = 5 + + +# # Read JMX metrics through Jolokia +# [[inputs.jolokia]] +# # DEPRECATED: the jolokia plugin has been deprecated in favor of the +# # jolokia2 plugin +# # see https://github.com/influxdata/telegraf/tree/master/plugins/inputs/jolokia2 +# +# ## This is the context root used to compose the jolokia url +# ## NOTE that Jolokia requires a trailing slash at the end of the context root +# ## NOTE that your jolokia security policy must allow for POST requests. +# context = "/jolokia/" +# +# ## This specifies the mode used +# # mode = "proxy" +# # +# ## When in proxy mode this section is used to specify further +# ## proxy address configurations. +# ## Remember to change host address to fit your environment. +# # [inputs.jolokia.proxy] +# # host = "127.0.0.1" +# # port = "8080" +# +# ## Optional http timeouts +# ## +# ## response_header_timeout, if non-zero, specifies the amount of time to wait +# ## for a server's response headers after fully writing the request. +# # response_header_timeout = "3s" +# ## +# ## client_timeout specifies a time limit for requests made by this client. +# ## Includes connection time, any redirects, and reading the response body. +# # client_timeout = "4s" +# +# ## Attribute delimiter +# ## +# ## When multiple attributes are returned for a single +# ## [inputs.jolokia.metrics], the field name is a concatenation of the metric +# ## name, and the attribute name, separated by the given delimiter. 
+# # delimiter = "_" +# +# ## List of servers exposing jolokia read service +# [[inputs.jolokia.servers]] +# name = "as-server-01" +# host = "127.0.0.1" +# port = "8080" +# # username = "myuser" +# # password = "mypassword" +# +# ## List of metrics collected on above servers +# ## Each metric consists in a name, a jmx path and either +# ## a pass or drop slice attribute. +# ## This collect all heap memory usage metrics. +# [[inputs.jolokia.metrics]] +# name = "heap_memory_usage" +# mbean = "java.lang:type=Memory" +# attribute = "HeapMemoryUsage" +# +# ## This collect thread counts metrics. +# [[inputs.jolokia.metrics]] +# name = "thread_count" +# mbean = "java.lang:type=Threading" +# attribute = "TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount" +# +# ## This collect number of class loaded/unloaded counts metrics. +# [[inputs.jolokia.metrics]] +# name = "class_count" +# mbean = "java.lang:type=ClassLoading" +# attribute = "LoadedClassCount,UnloadedClassCount,TotalLoadedClassCount" + + +# # Read JMX metrics from a Jolokia REST agent endpoint +# [[inputs.jolokia2_agent]] +# # default_tag_prefix = "" +# # default_field_prefix = "" +# # default_field_separator = "." +# +# # Add agents URLs to query +# urls = ["http://localhost:8080/jolokia"] +# # username = "" +# # password = "" +# # response_timeout = "5s" +# +# ## Optional TLS config +# # tls_ca = "/var/private/ca.pem" +# # tls_cert = "/var/private/client.pem" +# # tls_key = "/var/private/client-key.pem" +# # insecure_skip_verify = false +# +# ## Add metrics to read +# [[inputs.jolokia2_agent.metric]] +# name = "java_runtime" +# mbean = "java.lang:type=Runtime" +# paths = ["Uptime"] + + +# # Read JMX metrics from a Jolokia REST proxy endpoint +# [[inputs.jolokia2_proxy]] +# # default_tag_prefix = "" +# # default_field_prefix = "" +# # default_field_separator = "." 
+# +# ## Proxy agent +# url = "http://localhost:8080/jolokia" +# # username = "" +# # password = "" +# # response_timeout = "5s" +# +# ## Optional TLS config +# # tls_ca = "/var/private/ca.pem" +# # tls_cert = "/var/private/client.pem" +# # tls_key = "/var/private/client-key.pem" +# # insecure_skip_verify = false +# +# ## Add proxy targets to query +# # default_target_username = "" +# # default_target_password = "" +# [[inputs.jolokia2_proxy.target]] +# url = "service:jmx:rmi:///jndi/rmi://targethost:9999/jmxrmi" +# # username = "" +# # password = "" +# +# ## Add metrics to read +# [[inputs.jolokia2_proxy.metric]] +# name = "java_runtime" +# mbean = "java.lang:type=Runtime" +# paths = ["Uptime"] + + +# # Read Kapacitor-formatted JSON metrics from one or more HTTP endpoints +# [[inputs.kapacitor]] +# ## Multiple URLs from which to read Kapacitor-formatted JSON +# ## Default is "http://localhost:9092/kapacitor/v1/debug/vars". +# urls = [ +# "http://localhost:9092/kapacitor/v1/debug/vars" +# ] +# +# ## Time limit for http requests +# timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Get kernel statistics from /proc/vmstat +# [[inputs.kernel_vmstat]] +# # no configuration + + +# # Read status information from one or more Kibana servers +# [[inputs.kibana]] +# ## Specify a list of one or more Kibana servers +# servers = ["http://localhost:5601"] +# +# ## Timeout for HTTP requests +# timeout = "5s" +# +# ## HTTP Basic Auth credentials +# # username = "username" +# # password = "pa$$word" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from the Kubernetes api +# 
[[inputs.kube_inventory]] +# ## URL for the Kubernetes API +# url = "https://127.0.0.1" +# +# ## Namespace to use. Set to "" to use all namespaces. +# # namespace = "default" +# +# ## Use bearer token for authorization. ('bearer_token' takes priority) +# ## If both of these are empty, we'll use the default serviceaccount: +# ## at: /run/secrets/kubernetes.io/serviceaccount/token +# # bearer_token = "/path/to/bearer/token" +# ## OR +# # bearer_token_string = "abc_123" +# +# ## Set response_timeout (default 5 seconds) +# # response_timeout = "5s" +# +# ## Optional Resources to exclude from gathering +# ## Leave this blank to try to gather everything available. +# ## Values can be - "daemonsets", "deployments", "endpoints", "ingress", "nodes", +# ## "persistentvolumes", "persistentvolumeclaims", "pods", "services", "statefulsets" +# # resource_exclude = [ "deployments", "nodes", "statefulsets" ] +# +# ## Optional Resources to include when gathering +# ## Overrides resource_exclude if both set. +# # resource_include = [ "deployments", "nodes", "statefulsets" ] +# +# ## selectors to include and exclude as tags. Globs accepted. +# ## Note that an empty array for both will include all selectors as tags +# ## selector_exclude overrides selector_include if both set. +# # selector_include = [] +# # selector_exclude = ["*"] +# +# ## Optional TLS Config +# # tls_ca = "/path/to/cafile" +# # tls_cert = "/path/to/certfile" +# # tls_key = "/path/to/keyfile" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from the kubernetes kubelet api +# [[inputs.kubernetes]] +# ## URL for the kubelet +# url = "http://127.0.0.1:10255" +# +# ## Use bearer token for authorization. 
('bearer_token' takes priority) +# ## If both of these are empty, we'll use the default serviceaccount: +# ## at: /run/secrets/kubernetes.io/serviceaccount/token +# # bearer_token = "/path/to/bearer/token" +# ## OR +# # bearer_token_string = "abc_123" +# +# ## Pod labels to be added as tags. An empty array for both include and +# ## exclude will include all labels. +# # label_include = [] +# # label_exclude = ["*"] +# +# ## Set response_timeout (default 5 seconds) +# # response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = /path/to/cafile +# # tls_cert = /path/to/certfile +# # tls_key = /path/to/keyfile +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from a LeoFS Server via SNMP +# [[inputs.leofs]] +# ## An array of URLs of the form: +# ## host [ ":" port] +# servers = ["127.0.0.1:4020"] + + +# # Provides Linux sysctl fs metrics +# [[inputs.linux_sysctl_fs]] +# # no configuration + + +# # Read metrics exposed by Logstash +# [[inputs.logstash]] +# ## The URL of the exposed Logstash API endpoint. +# url = "http://127.0.0.1:9600" +# +# ## Use Logstash 5 single pipeline API, set to true when monitoring +# ## Logstash 5. +# # single_pipeline = false +# +# ## Enable optional collection components. Can contain +# ## "pipelines", "process", and "jvm". +# # collect = ["pipelines", "process", "jvm"] +# +# ## Timeout for HTTP requests. +# # timeout = "5s" +# +# ## Optional HTTP Basic Auth credentials. +# # username = "username" +# # password = "pa$$word" +# +# ## Optional TLS Config. +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## Use TLS but skip chain & host verification. +# # insecure_skip_verify = false +# +# ## Optional HTTP headers. 
+# # [inputs.logstash.headers] +# # "X-Special-Header" = "Special-Value" + + +# # Read metrics from local Lustre service on OST, MDS +# [[inputs.lustre2]] +# ## An array of /proc globs to search for Lustre stats +# ## If not specified, the default will work on Lustre 2.5.x +# ## +# # ost_procfiles = [ +# # "/proc/fs/lustre/obdfilter/*/stats", +# # "/proc/fs/lustre/osd-ldiskfs/*/stats", +# # "/proc/fs/lustre/obdfilter/*/job_stats", +# # ] +# # mds_procfiles = [ +# # "/proc/fs/lustre/mdt/*/md_stats", +# # "/proc/fs/lustre/mdt/*/job_stats", +# # ] + + +# # Gathers metrics from the /3.0/reports MailChimp API +# [[inputs.mailchimp]] +# ## MailChimp API key +# ## get from https://admin.mailchimp.com/account/api/ +# api_key = "" # required +# ## Reports for campaigns sent more than days_old ago will not be collected. +# ## 0 means collect all. +# days_old = 0 +# ## Campaign ID to get, if empty gets all campaigns, this option overrides days_old +# # campaign_id = "" + + +# # Retrieves information on a specific host in a MarkLogic Cluster +# [[inputs.marklogic]] +# ## Base URL of the MarkLogic HTTP Server. +# url = "http://localhost:8002" +# +# ## List of specific hostnames to retrieve information. At least (1) required. +# # hosts = ["hostname1", "hostname2"] +# +# ## Using HTTP Basic Authentication. Management API requires 'manage-user' role privileges +# # username = "myuser" +# # password = "mypassword" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from one or many mcrouter servers +# [[inputs.mcrouter]] +# ## An array of address to gather stats about. Specify an ip or hostname +# ## with port. ie tcp://localhost:11211, tcp://10.0.0.1:11211, etc. 
+# servers = ["tcp://localhost:11211", "unix:///var/run/mcrouter.sock"] +# +# ## Timeout for metric collections from all servers. Minimum timeout is "1s". +# # timeout = "5s" + + +# # Read metrics from one or many memcached servers +# [[inputs.memcached]] +# ## An array of addresses to gather stats about. Specify an ip or hostname +# ## with optional port. ie localhost, 10.0.0.1:11211, etc. +# servers = ["localhost:11211"] +# # unix_sockets = ["/var/run/memcached.sock"] + + +# # Telegraf plugin for gathering metrics from N Mesos masters +# [[inputs.mesos]] +# ## Timeout, in ms. +# timeout = 100 +# +# ## A list of Mesos masters. +# masters = ["http://localhost:5050"] +# +# ## Master metrics groups to be collected, by default, all enabled. +# master_collections = [ +# "resources", +# "master", +# "system", +# "agents", +# "frameworks", +# "framework_offers", +# "tasks", +# "messages", +# "evqueue", +# "registrar", +# "allocator", +# ] +# +# ## A list of Mesos slaves, default is [] +# # slaves = [] +# +# ## Slave metrics groups to be collected, by default, all enabled. +# # slave_collections = [ +# # "resources", +# # "agent", +# # "system", +# # "executors", +# # "tasks", +# # "messages", +# # ] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Collects scores from a Minecraft server's scoreboard using the RCON protocol +# [[inputs.minecraft]] +# ## Address of the Minecraft server. +# # server = "localhost" +# +# ## Server RCON Port. +# # port = "25575" +# +# ## Server RCON Password. +# password = "" +# +# ## Uncomment to remove deprecated metric components. 
+# # tagdrop = ["server"] + + +# # Retrieve data from MODBUS slave devices +# [[inputs.modbus]] +# ## Connection Configuration +# ## +# ## The plugin supports connections to PLCs via MODBUS/TCP or +# ## via serial line communication in binary (RTU) or readable (ASCII) encoding +# ## +# ## Device name +# name = "Device" +# +# ## Slave ID - addresses a MODBUS device on the bus +# ## Range: 0 - 255 [0 = broadcast; 248 - 255 = reserved] +# slave_id = 1 +# +# ## Timeout for each request +# timeout = "1s" +# +# ## Maximum number of retries and the time to wait between retries +# ## when a slave-device is busy. +# # busy_retries = 0 +# # busy_retries_wait = "100ms" +# +# # TCP - connect via Modbus/TCP +# controller = "tcp://localhost:502" +# +# ## Serial (RS485; RS232) +# # controller = "file:///dev/ttyUSB0" +# # baud_rate = 9600 +# # data_bits = 8 +# # parity = "N" +# # stop_bits = 1 +# # transmission_mode = "RTU" +# +# +# ## Measurements +# ## +# +# ## Digital Variables, Discrete Inputs and Coils +# ## measurement - the (optional) measurement name, defaults to "modbus" +# ## name - the variable name +# ## address - variable address +# +# discrete_inputs = [ +# { name = "start", address = [0]}, +# { name = "stop", address = [1]}, +# { name = "reset", address = [2]}, +# { name = "emergency_stop", address = [3]}, +# ] +# coils = [ +# { name = "motor1_run", address = [0]}, +# { name = "motor1_jog", address = [1]}, +# { name = "motor1_stop", address = [2]}, +# ] +# +# ## Analog Variables, Input Registers and Holding Registers +# ## measurement - the (optional) measurement name, defaults to "modbus" +# ## name - the variable name +# ## byte_order - the ordering of bytes +# ## |---AB, ABCD - Big Endian +# ## |---BA, DCBA - Little Endian +# ## |---BADC - Mid-Big Endian +# ## |---CDAB - Mid-Little Endian +# ## data_type - INT16, UINT16, INT32, UINT32, INT64, UINT64, FLOAT32-IEEE (the IEEE 754 binary representation) +# ## FLOAT32, FIXED, UFIXED (fixed-point representation on 
input) +# ## scale - the final numeric variable representation +# ## address - variable address +# +# holding_registers = [ +# { name = "power_factor", byte_order = "AB", data_type = "FIXED", scale=0.01, address = [8]}, +# { name = "voltage", byte_order = "AB", data_type = "FIXED", scale=0.1, address = [0]}, +# { name = "energy", byte_order = "ABCD", data_type = "FIXED", scale=0.001, address = [5,6]}, +# { name = "current", byte_order = "ABCD", data_type = "FIXED", scale=0.001, address = [1,2]}, +# { name = "frequency", byte_order = "AB", data_type = "UFIXED", scale=0.1, address = [7]}, +# { name = "power", byte_order = "ABCD", data_type = "UFIXED", scale=0.1, address = [3,4]}, +# ] +# input_registers = [ +# { name = "tank_level", byte_order = "AB", data_type = "INT16", scale=1.0, address = [0]}, +# { name = "tank_ph", byte_order = "AB", data_type = "INT16", scale=1.0, address = [1]}, +# { name = "pump1_speed", byte_order = "ABCD", data_type = "INT32", scale=1.0, address = [3,4]}, +# ] + + +# # Read metrics from one or many MongoDB servers +# [[inputs.mongodb]] +# ## An array of URLs of the form: +# ## "mongodb://" [user ":" pass "@"] host [ ":" port] +# ## For example: +# ## mongodb://user:auth_key@10.10.3.30:27017, +# ## mongodb://10.10.3.33:18832, +# servers = ["mongodb://127.0.0.1:27017"] +# +# ## When true, collect cluster status +# ## Note that the query that counts jumbo chunks triggers a COLLSCAN, which +# ## may have an impact on performance. 
+# # gather_cluster_status = true +# +# ## When true, collect per database stats +# # gather_perdb_stats = false +# +# ## When true, collect per collection stats +# # gather_col_stats = false +# +# ## List of db where collections stats are collected +# ## If empty, all db are concerned +# # col_stats_dbs = ["local"] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics and status information about processes managed by Monit +# [[inputs.monit]] +# ## Monit HTTPD address +# address = "http://127.0.0.1:2812" +# +# ## Username and Password for Monit +# # username = "" +# # password = "" +# +# ## Amount of time allowed to complete the HTTP request +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Aggregates the contents of multiple files into a single point +# [[inputs.multifile]] +# ## Base directory where telegraf will look for files. +# ## Omit this option to use absolute paths. +# base_dir = "/sys/bus/i2c/devices/1-0076/iio:device0" +# +# ## If true, Telegraf discard all data when a single file can't be read. +# ## Else, Telegraf omits the field generated from this file. +# # fail_early = true +# +# ## Files to parse each interval. 
+# [[inputs.multifile.file]] +# file = "in_pressure_input" +# dest = "pressure" +# conversion = "float" +# [[inputs.multifile.file]] +# file = "in_temp_input" +# dest = "temperature" +# conversion = "float(3)" +# [[inputs.multifile.file]] +# file = "in_humidityrelative_input" +# dest = "humidityrelative" +# conversion = "float(3)" + + +# # Read metrics from one or many mysql servers +# [[inputs.mysql]] +# ## specify servers via a url matching: +# ## [username[:password]@][protocol[(address)]]/[?tls=[true|false|skip-verify|custom]] +# ## see https://github.com/go-sql-driver/mysql#dsn-data-source-name +# ## e.g. +# ## servers = ["user:passwd@tcp(127.0.0.1:3306)/?tls=false"] +# ## servers = ["user@tcp(127.0.0.1:3306)/?tls=false"] +# # +# ## If no servers are specified, then localhost is used as the host. +# servers = ["tcp(127.0.0.1:3306)/"] +# +# ## Selects the metric output format. +# ## +# ## This option exists to maintain backwards compatibility, if you have +# ## existing metrics do not set or change this value until you are ready to +# ## migrate to the new format. +# ## +# ## If you do not have existing metrics from this plugin set to the latest +# ## version. 
+# ## +# ## Telegraf >=1.6: metric_version = 2 +# ## <1.6: metric_version = 1 (or unset) +# metric_version = 2 +# +# ## if the list is empty, then metrics are gathered from all database tables +# # table_schema_databases = [] +# +# ## gather metrics from INFORMATION_SCHEMA.TABLES for databases provided above list +# # gather_table_schema = false +# +# ## gather thread state counts from INFORMATION_SCHEMA.PROCESSLIST +# # gather_process_list = false +# +# ## gather user statistics from INFORMATION_SCHEMA.USER_STATISTICS +# # gather_user_statistics = false +# +# ## gather auto_increment columns and max values from information schema +# # gather_info_schema_auto_inc = false +# +# ## gather metrics from INFORMATION_SCHEMA.INNODB_METRICS +# # gather_innodb_metrics = false +# +# ## gather metrics from SHOW SLAVE STATUS command output +# # gather_slave_status = false +# +# ## gather metrics from SHOW BINARY LOGS command output +# # gather_binary_logs = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.GLOBAL_VARIABLES +# # gather_global_variables = true +# +# ## gather metrics from PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMARY_BY_TABLE +# # gather_table_io_waits = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.TABLE_LOCK_WAITS +# # gather_table_lock_waits = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMARY_BY_INDEX_USAGE +# # gather_index_io_waits = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.EVENT_WAITS +# # gather_event_waits = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.FILE_SUMMARY_BY_EVENT_NAME +# # gather_file_events_stats = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_DIGEST +# # gather_perf_events_statements = false +# +# ## the limits for metrics from perf_events_statements +# # perf_events_statements_digest_text_limit = 120 +# # perf_events_statements_limit = 250 +# # perf_events_statements_time_limit = 86400 +# +# ## Some queries we may want to run less often (such as 
SHOW GLOBAL VARIABLES) +# ## example: interval_slow = "30m" +# # interval_slow = "" +# +# ## Optional TLS Config (will be used if tls=custom parameter specified in server uri) +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Provides metrics about the state of a NATS server +# [[inputs.nats]] +# ## The address of the monitoring endpoint of the NATS server +# server = "http://localhost:8222" +# +# ## Maximum time to receive response +# # response_timeout = "5s" + + +# # Neptune Apex data collector +# [[inputs.neptune_apex]] +# ## The Neptune Apex plugin reads the publicly available status.xml data from a local Apex. +# ## Measurements will be logged under "apex". +# +# ## The base URL of the local Apex(es). If you specify more than one server, they will +# ## be differentiated by the "source" tag. +# servers = [ +# "http://apex.local", +# ] +# +# ## The response_timeout specifies how long to wait for a reply from the Apex. +# #response_timeout = "5s" + + +# # Read metrics about network interface usage +[[inputs.net]] +# ## By default, telegraf gathers stats from any up interface (excluding loopback) +# ## Setting interfaces will tell it to gather these explicit interfaces, +# ## regardless of status. +# ## +# # interfaces = ["eth0"] +# ## +# ## On linux systems telegraf also collects protocol stats. +# ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. +# ## +# # ignore_protocol_stats = false +# ## + + +# # Collect response time of a TCP or UDP connection +# [[inputs.net_response]] +# ## Protocol, must be "tcp" or "udp" +# ## NOTE: because the "udp" protocol does not respond to requests, it requires +# ## a send/expect string pair (see below). 
+# protocol = "tcp" +# ## Server address (default localhost) +# address = "localhost:80" +# +# ## Set timeout +# # timeout = "1s" +# +# ## Set read timeout (only used if expecting a response) +# # read_timeout = "1s" +# +# ## The following options are required for UDP checks. For TCP, they are +# ## optional. The plugin will send the given string to the server and then +# ## expect to receive the given 'expect' string back. +# ## string sent to the server +# # send = "ssh" +# ## expected string in answer +# # expect = "ssh" +# +# ## Uncomment to remove deprecated fields +# # fielddrop = ["result_type", "string_found"] + + +# # Read TCP metrics such as established, time wait and sockets counts. +# [[inputs.netstat]] +# # no configuration + + +# # Read Nginx's basic status information (ngx_http_stub_status_module) +# [[inputs.nginx]] +# # An array of Nginx stub_status URI to gather stats. +# urls = ["http://localhost/server_status"] +# +# ## Optional TLS Config +# tls_ca = "/etc/telegraf/ca.pem" +# tls_cert = "/etc/telegraf/cert.cer" +# tls_key = "/etc/telegraf/key.key" +# ## Use TLS but skip chain & host verification +# insecure_skip_verify = false +# +# # HTTP response timeout (default: 5s) +# response_timeout = "5s" + + +# # Read Nginx Plus' full status information (ngx_http_status_module) +# [[inputs.nginx_plus]] +# ## An array of ngx_http_status_module or status URI to gather stats. +# urls = ["http://localhost/status"] +# +# # HTTP response timeout (default: 5s) +# response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read Nginx Plus Api documentation +# [[inputs.nginx_plus_api]] +# ## An array of API URI to gather stats. 
+# urls = ["http://localhost/api"] +# +# # Nginx API version, default: 3 +# # api_version = 3 +# +# # HTTP response timeout (default: 5s) +# response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read Nginx virtual host traffic status module information (nginx-module-sts) +# [[inputs.nginx_sts]] +# ## An array of ngx_http_status_module or status URI to gather stats. +# urls = ["http://localhost/status"] +# +# ## HTTP response timeout (default: 5s) +# response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read nginx_upstream_check module status information (https://github.com/yaoweibin/nginx_upstream_check_module) +# [[inputs.nginx_upstream_check]] +# ## An URL where Nginx Upstream check module is enabled +# ## It should be set to return a JSON formatted response +# url = "http://127.0.0.1/status?format=json" +# +# ## HTTP method +# # method = "GET" +# +# ## Optional HTTP headers +# # headers = {"X-Special-Header" = "Special-Value"} +# +# ## Override HTTP "Host" header +# # host_header = "check.example.com" +# +# ## Timeout for HTTP requests +# timeout = "5s" +# +# ## Optional HTTP Basic Auth credentials +# # username = "username" +# # password = "pa$$word" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read Nginx virtual host traffic status module information (nginx-module-vts) +# [[inputs.nginx_vts]] +# ## An array of ngx_http_status_module or status URI to gather stats. 
+# urls = ["http://localhost/status"] +# +# ## HTTP response timeout (default: 5s) +# response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read NSQ topic and channel statistics. +# [[inputs.nsq]] +# ## An array of NSQD HTTP API endpoints +# endpoints = ["http://localhost:4151"] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Collect kernel snmp counters and network interface statistics +# [[inputs.nstat]] +# ## file paths for proc files. If empty default paths will be used: +# ## /proc/net/netstat, /proc/net/snmp, /proc/net/snmp6 +# ## These can also be overridden with env variables, see README. +# proc_net_netstat = "/proc/net/netstat" +# proc_net_snmp = "/proc/net/snmp" +# proc_net_snmp6 = "/proc/net/snmp6" +# ## dump metrics with 0 values too +# dump_zeros = true + + +# # Get standard NTP query metrics, requires ntpq executable. +# [[inputs.ntpq]] +# ## If false, set the -n ntpq flag. Can reduce metric gather time. +# dns_lookup = true + + +# # Pulls statistics from nvidia GPUs attached to the host +##[[inputs.nvidia_smi]] +# ## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath +# # bin_path = "/usr/bin/nvidia-smi" +# +# ## Optional: timeout for GPU polling +# # timeout = "5s" + +[[inputs.amd_rocm_smi]] + +# # OpenLDAP cn=Monitor plugin +# [[inputs.openldap]] +# host = "localhost" +# port = 389 +# +# # ldaps, starttls, or no encryption. default is an empty string, disabling all encryption. 
+# # note that port will likely need to be changed to 636 for ldaps +# # valid options: "" | "starttls" | "ldaps" +# tls = "" +# +# # skip peer certificate verification. Default is false. +# insecure_skip_verify = false +# +# # Path to PEM-encoded Root certificate to use to verify server certificate +# tls_ca = "/etc/ssl/certs.pem" +# +# # dn/password to bind with. If bind_dn is empty, an anonymous bind is performed. +# bind_dn = "" +# bind_password = "" +# +# # Reverse metric names so they sort more naturally. Recommended. +# # This defaults to false if unset, but is set to true when generating a new config +# reverse_metric_names = true + + +# # Get standard NTP query metrics from OpenNTPD. +# [[inputs.openntpd]] +# ## Run ntpctl binary with sudo. +# # use_sudo = false +# +# ## Location of the ntpctl binary. +# # binary = "/usr/sbin/ntpctl" +# +# ## Maximum time the ntpctl binary is allowed to run. +# # timeout = "5ms" + + +# # A plugin to collect stats from OpenSMTPD - a free implementation of the server-side SMTP protocol +# [[inputs.opensmtpd]] +# ## If running as a restricted user you can prepend sudo for additional access: +# #use_sudo = false +# +# ## The default location of the smtpctl binary can be overridden with: +# binary = "/usr/sbin/smtpctl" +# +# ## The default timeout of 1000ms can be overridden with (in milliseconds): +# timeout = 1000 + + +# # Read current weather and forecasts data from openweathermap.org +# [[inputs.openweathermap]] +# ## OpenWeatherMap API key. +# app_id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +# +# ## City ID's to collect weather data from. +# city_id = ["5391959"] +# +# ## Language of the description field. Can be one of "ar", "bg", +# ## "ca", "cz", "de", "el", "en", "fa", "fi", "fr", "gl", "hr", "hu", +# ## "it", "ja", "kr", "la", "lt", "mk", "nl", "pl", "pt", "ro", "ru", +# ## "se", "sk", "sl", "es", "tr", "ua", "vi", "zh_cn", "zh_tw" +# # lang = "en" +# +# ## APIs to fetch; can contain "weather" or "forecast". 
+# fetch = ["weather", "forecast"] +# +# ## OpenWeatherMap base URL +# # base_url = "https://api.openweathermap.org/" +# +# ## Timeout for HTTP response. +# # response_timeout = "5s" +# +# ## Preferred unit system for temperature and wind speed. Can be one of +# ## "metric", "imperial", or "standard". +# # units = "metric" +# +# ## Query interval; OpenWeatherMap updates their weather data every 10 +# ## minutes. +# interval = "10m" + + +# # Read metrics of passenger using passenger-status +# [[inputs.passenger]] +# ## Path of passenger-status. +# ## +# ## Plugin gather metric via parsing XML output of passenger-status +# ## More information about the tool: +# ## https://www.phusionpassenger.com/library/admin/apache/overall_status_report.html +# ## +# ## If no path is specified, then the plugin simply execute passenger-status +# ## hopefully it can be found in your PATH +# command = "passenger-status -v --show=xml" + + +# # Gather counters from PF +# [[inputs.pf]] +# ## PF require root access on most systems. +# ## Setting 'use_sudo' to true will make use of sudo to run pfctl. +# ## Users must configure sudo to allow telegraf user to run pfctl with no password. +# ## pfctl can be restricted to only list command "pfctl -s info". +# use_sudo = false + + +# # Read metrics of phpfpm, via HTTP status page or socket +# [[inputs.phpfpm]] +# ## An array of addresses to gather stats about. 
Specify an ip or hostname +# ## with optional port and path +# ## +# ## Plugin can be configured in three modes (either can be used): +# ## - http: the URL must start with http:// or https://, ie: +# ## "http://localhost/status" +# ## "http://192.168.130.1/status?full" +# ## +# ## - unixsocket: path to fpm socket, ie: +# ## "/var/run/php5-fpm.sock" +# ## or using a custom fpm status path: +# ## "/var/run/php5-fpm.sock:fpm-custom-status-path" +# ## +# ## - fcgi: the URL must start with fcgi:// or cgi://, and port must be present, ie: +# ## "fcgi://10.0.0.12:9000/status" +# ## "cgi://10.0.10.12:9001/status" +# ## +# ## Example of multiple gathering from local socket and remote host +# ## urls = ["http://192.168.1.20/status", "/tmp/fpm.sock"] +# urls = ["http://localhost/status"] +# +# ## Duration allowed to complete HTTP requests. +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Ping given url(s) and return statistics +# [[inputs.ping]] +# ## Hosts to send ping packets to. +# urls = ["example.org"] +# +# ## Method used for sending pings, can be either "exec" or "native". When set +# ## to "exec" the systems ping command will be executed. When set to "native" +# ## the plugin will send pings directly. +# ## +# ## While the default is "exec" for backwards compatibility, new deployments +# ## are encouraged to use the "native" method for improved compatibility and +# ## performance. +# # method = "exec" +# +# ## Number of ping packets to send per interval. Corresponds to the "-c" +# ## option of the ping command. +# # count = 1 +# +# ## Time to wait between sending ping packets in seconds. Operates like the +# ## "-i" option of the ping command. +# # ping_interval = 1.0 +# +# ## If set, the time to wait for a ping response in seconds. 
Operates like +# ## the "-W" option of the ping command. +# # timeout = 1.0 +# +# ## If set, the total ping deadline, in seconds. Operates like the -w option +# ## of the ping command. +# # deadline = 10 +# +# ## Interface or source address to send ping from. Operates like the -I or -S +# ## option of the ping command. +# # interface = "" +# +# ## Specify the ping executable binary. +# # binary = "ping" +# +# ## Arguments for ping command. When arguments is not empty, the command from +# ## the binary option will be used and other options (ping_interval, timeout, +# ## etc) will be ignored. +# # arguments = ["-c", "3"] +# +# ## Use only IPv6 addresses when resolving a hostname. +# # ipv6 = false + + +# # Measure postfix queue statistics +# [[inputs.postfix]] +# ## Postfix queue directory. If not provided, telegraf will try to use +# ## 'postconf -h queue_directory' to determine it. +# # queue_directory = "/var/spool/postfix" + + +# # Read metrics from one or many PowerDNS servers +# [[inputs.powerdns]] +# ## An array of sockets to gather stats about. +# ## Specify a path to unix socket. +# unix_sockets = ["/var/run/pdns.controlsocket"] + + +# # Read metrics from one or many PowerDNS Recursor servers +# [[inputs.powerdns_recursor]] +# ## Path to the Recursor control socket. +# unix_sockets = ["/var/run/pdns_recursor.controlsocket"] +# +# ## Directory to create receive socket. This default is likely not writable, +# ## please reference the full plugin documentation for a recommended setup. +# # socket_dir = "/var/run/" +# ## Socket permissions for the receive socket. 
+# # socket_mode = "0666" + + +# # Monitor process cpu and memory usage +# [[inputs.procstat]] +# ## PID file to monitor process +# pid_file = "/var/run/nginx.pid" +# ## executable name (ie, pgrep <exe>) +# # exe = "nginx" +# ## pattern as argument for pgrep (ie, pgrep -f <pattern>) +# # pattern = "nginx" +# ## user as argument for pgrep (ie, pgrep -u <user>) +# # user = "nginx" +# ## Systemd unit name +# # systemd_unit = "nginx.service" +# ## CGroup name or path +# # cgroup = "systemd/system.slice/nginx.service" +# +# ## Windows service name +# # win_service = "" +# +# ## override for process_name +# ## This is optional; default is sourced from /proc/<pid>/status +# # process_name = "bar" +# +# ## Field name prefix +# # prefix = "" +# +# ## When true add the full cmdline as a tag. +# # cmdline_tag = false +# +# ## Add the PID as a tag instead of as a field. When collecting multiple +# ## processes with otherwise matching tags this setting should be enabled to +# ## ensure each process has a unique identity. +# ## +# ## Enabling this option may result in a large number of series, especially +# ## when processes have a short lifetime. +# # pid_tag = false +# +# ## Method to use when finding process IDs. Can be one of 'pgrep', or +# ## 'native'. The pgrep finder calls the pgrep executable in the PATH while +# ## the native finder performs the search directly in a manner dependent on the +# ## platform. Default is 'pgrep' +# # pid_finder = "pgrep" + + +# # Reads last_run_summary.yaml file and converts to measurements +# [[inputs.puppetagent]] +# ## Location of puppet last run summary file +# location = "/var/lib/puppet/state/last_run_summary.yaml" + + +# # Reads metrics from RabbitMQ servers via the Management Plugin +# [[inputs.rabbitmq]] +# ## Management Plugin url. 
(default: http://localhost:15672) +# # url = "http://localhost:15672" +# ## Tag added to rabbitmq_overview series; deprecated: use tags +# # name = "rmq-server-1" +# ## Credentials +# # username = "guest" +# # password = "guest" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Optional request timeouts +# ## +# ## ResponseHeaderTimeout, if non-zero, specifies the amount of time to wait +# ## for a server's response headers after fully writing the request. +# # header_timeout = "3s" +# ## +# ## client_timeout specifies a time limit for requests made by this client. +# ## Includes connection time, any redirects, and reading the response body. +# # client_timeout = "4s" +# +# ## A list of nodes to gather as the rabbitmq_node measurement. If not +# ## specified, metrics for all nodes are gathered. +# # nodes = ["rabbit@node1", "rabbit@node2"] +# +# ## A list of queues to gather as the rabbitmq_queue measurement. If not +# ## specified, metrics for all queues are gathered. +# # queues = ["telegraf"] +# +# ## A list of exchanges to gather as the rabbitmq_exchange measurement. If not +# ## specified, metrics for all exchanges are gathered. +# # exchanges = ["telegraf"] +# +# ## Queues to include and exclude. Globs accepted. +# ## Note that an empty array for both will include all queues +# queue_name_include = [] +# queue_name_exclude = [] +# +# ## Federation upstreams include and exclude when gathering the rabbitmq_federation measurement. +# ## If neither are specified, metrics for all federation upstreams are gathered. +# ## Federation link metrics will only be gathered for queues and exchanges +# ## whose non-federation metrics will be collected (e.g a queue excluded +# ## by the 'queue_name_exclude' option will also be excluded from federation). +# ## Globs accepted. 
+# # federation_upstream_include = ["dataCentre-*"] +# # federation_upstream_exclude = [] + + +# # Read raindrops stats (raindrops - real-time stats for preforking Rack servers) +# [[inputs.raindrops]] +# ## An array of raindrops middleware URI to gather stats. +# urls = ["http://localhost:8080/_raindrops"] + + +# # Read CPU, Fans, Powersupply and Voltage metrics of hardware server through redfish APIs +# [[inputs.redfish]] +# ## Server url +# address = "https://127.0.0.1:5000" +# +# ## Username, Password for hardware server +# username = "root" +# password = "password123456" +# +# ## ComputerSystemId +# computer_system_id="2M220100SL" +# +# ## Amount of time allowed to complete the HTTP request +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from one or many redis servers +# [[inputs.redis]] +# ## specify servers via a url matching: +# ## [protocol://][:password]@address[:port] +# ## e.g. +# ## tcp://localhost:6379 +# ## tcp://:password@192.168.99.100 +# ## unix:///var/run/redis.sock +# ## +# ## If no servers are specified, then localhost is used as the host. +# ## If no port is specified, 6379 is used +# servers = ["tcp://localhost:6379"] +# +# ## specify server password +# # password = "s#cr@t%" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = true + + +# # Read metrics from one or many RethinkDB servers +# [[inputs.rethinkdb]] +# ## An array of URI to gather stats about. Specify an ip or hostname +# ## with optional port and password. ie, +# ## rethinkdb://user:auth_key@10.10.3.30:28105, +# ## rethinkdb://10.10.3.33:18832, +# ## 10.0.0.1:10000, etc. 
+# servers = ["127.0.0.1:28015"] +# ## +# ## If you use actual rethinkdb of > 2.3.0 with username/password authorization, +# ## protocol have to be named "rethinkdb2" - it will use 1_0 H. +# # servers = ["rethinkdb2://username:password@127.0.0.1:28015"] +# ## +# ## If you use older versions of rethinkdb (<2.2) with auth_key, protocol +# ## have to be named "rethinkdb". +# # servers = ["rethinkdb://username:auth_key@127.0.0.1:28015"] + + +# # Read metrics from one or many Riak servers +# [[inputs.riak]] +# # Specify a list of one or more riak http servers +# servers = ["http://localhost:8098"] + + +# # Read API usage and limits for a Salesforce organisation +# [[inputs.salesforce]] +# ## specify your credentials +# ## +# username = "your_username" +# password = "your_password" +# ## +# ## (optional) security token +# # security_token = "your_security_token" +# ## +# ## (optional) environment type (sandbox or production) +# ## default is: production +# ## +# # environment = "production" +# ## +# ## (optional) API version (default: "39.0") +# ## +# # version = "39.0" + + +# # Monitor sensors, requires lm-sensors package +# [[inputs.sensors]] +# ## Remove numbers from field names. +# ## If true, a field name like 'temp1_input' will be changed to 'temp_input'. +# # remove_numbers = true +# +# ## Timeout is the maximum amount of time that the sensors command can run. +# # timeout = "5s" + + +# # Read metrics from storage devices supporting S.M.A.R.T. +# [[inputs.smart]] +# ## Optionally specify the path to the smartctl executable +# # path = "/usr/bin/smartctl" +# +# ## On most platforms smartctl requires root access. +# ## Setting 'use_sudo' to true will make use of sudo to run smartctl. +# ## Sudo must be configured to allow the telegraf user to run smartctl +# ## without a password. +# # use_sudo = false +# +# ## Skip checking disks in this power mode. Defaults to +# ## "standby" to not wake up disks that have stopped rotating. 
+# ## See --nocheck in the man pages for smartctl. +# ## smartctl version 5.41 and 5.42 have faulty detection of +# ## power mode and might require changing this value to +# ## "never" depending on your disks. +# # nocheck = "standby" +# +# ## Gather all returned S.M.A.R.T. attribute metrics and the detailed +# ## information from each drive into the 'smart_attribute' measurement. +# # attributes = false +# +# ## Optionally specify devices to exclude from reporting. +# # excludes = [ "/dev/pass6" ] +# +# ## Optionally specify devices and device type, if unset +# ## a scan (smartctl --scan) for S.M.A.R.T. devices will +# ## be done and all found will be included except for the +# ## excluded in excludes. +# # devices = [ "/dev/ada0 -d atacam" ] +# +# ## Timeout for the smartctl command to complete. +# # timeout = "30s" + + +# # Retrieves SNMP values from remote agents +# [[inputs.snmp]] +# ## Agent addresses to retrieve values from. +# ## example: agents = ["udp://127.0.0.1:161"] +# ## agents = ["tcp://127.0.0.1:161"] +# agents = ["udp://127.0.0.1:161"] +# +# ## Timeout for each request. +# # timeout = "5s" +# +# ## SNMP version; can be 1, 2, or 3. +# # version = 2 +# +# ## SNMP community string. +# # community = "public" +# +# ## Number of retries to attempt. +# # retries = 3 +# +# ## The GETBULK max-repetitions parameter. +# # max_repetitions = 10 +# +# ## SNMPv3 authentication and encryption options. +# ## +# ## Security Name. +# # sec_name = "myuser" +# ## Authentication protocol; one of "MD5", "SHA", or "". +# # auth_protocol = "MD5" +# ## Authentication password. +# # auth_password = "pass" +# ## Security Level; one of "noAuthNoPriv", "authNoPriv", or "authPriv". +# # sec_level = "authNoPriv" +# ## Context Name. +# # context_name = "" +# ## Privacy protocol used for encrypted messages; one of "DES", "AES" or "". +# # priv_protocol = "" +# ## Privacy password used for encrypted messages. 
+# # priv_password = "" +# +# ## Add fields and tables defining the variables you wish to collect. This +# ## example collects the system uptime and interface variables. Reference the +# ## full plugin documentation for configuration details. + + +# # DEPRECATED! PLEASE USE inputs.snmp INSTEAD. +# [[inputs.snmp_legacy]] +# ## Use 'oids.txt' file to translate oids to names +# ## To generate 'oids.txt' you need to run: +# ## snmptranslate -m all -Tz -On | sed -e 's/"//g' > /tmp/oids.txt +# ## Or if you have an other MIB folder with custom MIBs +# ## snmptranslate -M /mycustommibfolder -Tz -On -m all | sed -e 's/"//g' > oids.txt +# snmptranslate_file = "/tmp/oids.txt" +# [[inputs.snmp.host]] +# address = "192.168.2.2:161" +# # SNMP community +# community = "public" # default public +# # SNMP version (1, 2 or 3) +# # Version 3 not supported yet +# version = 2 # default 2 +# # SNMP response timeout +# timeout = 2.0 # default 2.0 +# # SNMP request retries +# retries = 2 # default 2 +# # Which get/bulk do you want to collect for this host +# collect = ["mybulk", "sysservices", "sysdescr"] +# # Simple list of OIDs to get, in addition to "collect" +# get_oids = [] +# +# [[inputs.snmp.host]] +# address = "192.168.2.3:161" +# community = "public" +# version = 2 +# timeout = 2.0 +# retries = 2 +# collect = ["mybulk"] +# get_oids = [ +# "ifNumber", +# ".1.3.6.1.2.1.1.3.0", +# ] +# +# [[inputs.snmp.get]] +# name = "ifnumber" +# oid = "ifNumber" +# +# [[inputs.snmp.get]] +# name = "interface_speed" +# oid = "ifSpeed" +# instance = "0" +# +# [[inputs.snmp.get]] +# name = "sysuptime" +# oid = ".1.3.6.1.2.1.1.3.0" +# unit = "second" +# +# [[inputs.snmp.bulk]] +# name = "mybulk" +# max_repetition = 127 +# oid = ".1.3.6.1.2.1.1" +# +# [[inputs.snmp.bulk]] +# name = "ifoutoctets" +# max_repetition = 127 +# oid = "ifOutOctets" +# +# [[inputs.snmp.host]] +# address = "192.168.2.13:161" +# #address = "127.0.0.1:161" +# community = "public" +# version = 2 +# timeout = 2.0 +# retries = 2 +# 
#collect = ["mybulk", "sysservices", "sysdescr", "systype"] +# collect = ["sysuptime" ] +# [[inputs.snmp.host.table]] +# name = "iftable3" +# include_instances = ["enp5s0", "eth1"] +# +# # SNMP TABLEs +# # table with neither mapping nor subtables +# [[inputs.snmp.table]] +# name = "iftable1" +# oid = ".1.3.6.1.2.1.31.1.1.1" +# +# # table without mapping but with subtables +# [[inputs.snmp.table]] +# name = "iftable2" +# oid = ".1.3.6.1.2.1.31.1.1.1" +# sub_tables = [".1.3.6.1.2.1.2.2.1.13"] +# +# # table with mapping but without subtables +# [[inputs.snmp.table]] +# name = "iftable3" +# oid = ".1.3.6.1.2.1.31.1.1.1" +# # if empty, get all instances +# mapping_table = ".1.3.6.1.2.1.31.1.1.1.1" +# # if empty, get all subtables +# +# # table with both mapping and subtables +# [[inputs.snmp.table]] +# name = "iftable4" +# oid = ".1.3.6.1.2.1.31.1.1.1" +# # if empty, get all instances +# mapping_table = ".1.3.6.1.2.1.31.1.1.1.1" +# # if empty, get all subtables +# # sub_tables entries need not be "real subtables" +# sub_tables=[".1.3.6.1.2.1.2.2.1.13", "bytes_recv", "bytes_send"] + + +# # Read stats from one or more Solr servers or cores +# [[inputs.solr]] +# ## specify a list of one or more Solr servers +# servers = ["http://localhost:8983"] +# +# ## specify a list of one or more Solr cores (default - all) +# # cores = ["main"] +# +# ## Optional HTTP Basic Auth Credentials +# # username = "username" +# # password = "pa$$word" + + +# # Read metrics from Microsoft SQL Server +# [[inputs.sqlserver]] +# ## Specify instances to monitor with a list of connection strings. +# ## All connection parameters are optional. +# ## By default, the host is localhost, listening on default port, TCP 1433. +# ## for Windows, the user is the currently running AD user (SSO).
+# ## See https://github.com/denisenkom/go-mssqldb for detailed connection +# ## parameters, in particular, tls connections can be created like so: +# ## "encrypt=true;certificate=;hostNameInCertificate=" +# # servers = [ +# # "Server=192.168.1.10;Port=1433;User Id=;Password=;app name=telegraf;log=1;", +# # ] +# +# ## Optional parameter, setting this to 2 will use a new version +# ## of the collection queries that break compatibility with the original +# ## dashboards. +# ## Version 2 - is compatible from SQL Server 2012 and later versions and also for SQL Azure DB +# query_version = 2 +# +# ## If you are using AzureDB, setting this to true will gather resource utilization metrics +# # azuredb = false +# +# ## Possible queries +# ## Version 2: +# ## - PerformanceCounters +# ## - WaitStatsCategorized +# ## - DatabaseIO +# ## - ServerProperties +# ## - MemoryClerk +# ## - Schedulers +# ## - SqlRequests +# ## - VolumeSpace +# ## - Cpu +# ## Version 1: +# ## - PerformanceCounters +# ## - WaitStatsCategorized +# ## - CPUHistory +# ## - DatabaseIO +# ## - DatabaseSize +# ## - DatabaseStats +# ## - DatabaseProperties +# ## - MemoryClerk +# ## - VolumeSpace +# ## - PerformanceMetrics +# +# ## A list of queries to include. If not specified, all the above listed queries are used. +# # include_query = [] +# +# ## A list of queries to explicitly ignore. +# exclude_query = [ 'Schedulers' , 'SqlRequests'] + + +# # Gather timeseries from Google Cloud Platform v3 monitoring API +# [[inputs.stackdriver]] +# ## GCP Project +# project = "erudite-bloom-151019" +# +# ## Include timeseries that start with the given metric type. +# metric_type_prefix_include = [ +# "compute.googleapis.com/", +# ] +# +# ## Exclude timeseries that start with the given metric type. +# # metric_type_prefix_exclude = [] +# +# ## Many metrics are updated once per minute; it is recommended to override +# ## the agent level interval with a value of 1m or greater. 
+# interval = "1m" +# +# ## Maximum number of API calls to make per second. The quota for accounts +# ## varies; it can be viewed on the API dashboard: +# ## https://cloud.google.com/monitoring/quotas#quotas_and_limits +# # rate_limit = 14 +# +# ## The delay and window options control the number of points selected on +# ## each gather. When set, metrics are gathered between: +# ## start: now() - delay - window +# ## end: now() - delay +# # +# ## Collection delay; if set too low metrics may not yet be available. +# # delay = "5m" +# # +# ## If unset, the window will start at 1m and be updated dynamically to span +# ## the time between calls (approximately the length of the plugin interval). +# # window = "1m" +# +# ## TTL for cached list of metric types. This is the maximum amount of time +# ## it may take to discover new metrics. +# # cache_ttl = "1h" +# +# ## If true, raw bucket counts are collected for distribution value types. +# ## For a more lightweight collection, you may wish to disable and use +# ## distribution_aggregation_aligners instead. +# # gather_raw_distribution_buckets = true +# +# ## Aggregate functions to be used for metrics whose value type is +# ## distribution. These aggregate values are recorded in addition to raw +# ## bucket counts, if they are enabled. +# ## +# ## For a list of aligner strings see: +# ## https://cloud.google.com/monitoring/api/ref_v3/rpc/google.monitoring.v3#aligner +# # distribution_aggregation_aligners = [ +# # "ALIGN_PERCENTILE_99", +# # "ALIGN_PERCENTILE_95", +# # "ALIGN_PERCENTILE_50", +# # ] +# +# ## Filters can be added to reduce the number of time series matched. All +# ## functions are supported: starts_with, ends_with, has_substring, and +# ## one_of. Only the '=' operator is supported.
+# ## +# ## The logical operators when combining filters are defined statically using +# ## the following values: +# ## filter ::= <resource_labels> {AND <metric_labels>} +# ## resource_labels ::= <resource_labels> {OR <resource_label>} +# ## metric_labels ::= <metric_labels> {OR <metric_label>} +# ## +# ## For more details, see https://cloud.google.com/monitoring/api/v3/filters +# # +# ## Resource labels refine the time series selection with the following expression: +# ## resource.labels.<key> = <value> +# # [[inputs.stackdriver.filter.resource_labels]] +# # key = "instance_name" +# # value = 'starts_with("localhost")' +# # +# ## Metric labels refine the time series selection with the following expression: +# ## metric.labels.<key> = <value> +# # [[inputs.stackdriver.filter.metric_labels]] +# # key = "device_name" +# # value = 'one_of("sda", "sdb")' + + +# # Get synproxy counter statistics from procfs +# [[inputs.synproxy]] +# # no configuration + + +# # Sysstat metrics collector +# [[inputs.sysstat]] +# ## Path to the sadc command. +# # +# ## Common Defaults: +# ## Debian/Ubuntu: /usr/lib/sysstat/sadc +# ## Arch: /usr/lib/sa/sadc +# ## RHEL/CentOS: /usr/lib64/sa/sadc +# sadc_path = "/usr/lib/sa/sadc" # required +# +# ## Path to the sadf command, if it is not in PATH +# # sadf_path = "/usr/bin/sadf" +# +# ## Activities is a list of activities, that are passed as argument to the +# ## sadc collector utility (e.g: DISK, SNMP etc...) +# ## The more activities that are added, the more data is collected. +# # activities = ["DISK"] +# +# ## Group metrics to measurements. +# ## +# ## If group is false each metric will be prefixed with a description +# ## and represents itself a measurement. +# ## +# ## If group is true, corresponding metrics are grouped to a single measurement. +# # group = true +# +# ## Options for the sadf command. The values on the left represent the sadf +# ## options and the values on the right their description (which are used for +# ## grouping and prefixing metrics).
+# ## +# ## Run 'sar -h' or 'man sar' to find out the supported options for your +# ## sysstat version. +# [inputs.sysstat.options] +# -C = "cpu" +# -B = "paging" +# -b = "io" +# -d = "disk" # requires DISK activity +# "-n ALL" = "network" +# "-P ALL" = "per_cpu" +# -q = "queue" +# -R = "mem" +# -r = "mem_util" +# -S = "swap_util" +# -u = "cpu_util" +# -v = "inode" +# -W = "swap" +# -w = "task" +# # -H = "hugepages" # only available for newer linux distributions +# # "-I ALL" = "interrupts" # requires INT activity +# +# ## Device tags can be used to add additional tags for devices. +# ## For example the configuration below adds a tag vg with value rootvg for +# ## all metrics with sda devices. +# # [[inputs.sysstat.device_tags.sda]] +# # vg = "rootvg" + + +# # Gather systemd units state +# [[inputs.systemd_units]] +# ## Set timeout for systemctl execution +# # timeout = "1s" +# # +# ## Filter for a specific unit type, default is "service", other possible +# ## values are "socket", "target", "device", "mount", "automount", "swap", +# ## "timer", "path", "slice" and "scope": +# # unittype = "service" + + +# # Reads metrics from a Teamspeak 3 Server via ServerQuery +# [[inputs.teamspeak]] +# ## Server address for Teamspeak 3 ServerQuery +# # server = "127.0.0.1:10011" +# ## Username for ServerQuery +# username = "serverqueryuser" +# ## Password for ServerQuery +# password = "secret" +# ## Array of virtual servers +# # virtual_servers = [1] + + +# # Read metrics about temperature +# [[inputs.temp]] +# # no configuration + + +# # Read Tengine's basic status information (ngx_http_reqstat_module) +# [[inputs.tengine]] +# # An array of Tengine reqstat module URI to gather stats.
+# urls = ["http://127.0.0.1/us"] +# +# # HTTP response timeout (default: 5s) +# # response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.cer" +# # tls_key = "/etc/telegraf/key.key" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Gather metrics from the Tomcat server status page. +# [[inputs.tomcat]] +# ## URL of the Tomcat server status +# # url = "http://127.0.0.1:8080/manager/status/all?XML=true" +# +# ## HTTP Basic Auth Credentials +# # username = "tomcat" +# # password = "s3cret" +# +# ## Request timeout +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Inserts sine and cosine waves for demonstration purposes +# [[inputs.trig]] +# ## Set the amplitude +# amplitude = 10.0 + + +# # Read Twemproxy stats data +# [[inputs.twemproxy]] +# ## Twemproxy stats address and port (no scheme) +# addr = "localhost:22222" +# ## Monitor pool name +# pools = ["redis_pool", "mc_pool"] + + +# # A plugin to collect stats from the Unbound DNS resolver +# [[inputs.unbound]] +# ## Address of server to connect to, read from unbound conf default, optionally ':port' +# ## Will lookup IP if given a hostname +# server = "127.0.0.1:8953" +# +# ## If running as a restricted user you can prepend sudo for additional access: +# # use_sudo = false +# +# ## The default location of the unbound-control binary can be overridden with: +# # binary = "/usr/sbin/unbound-control" +# +# ## The default location of the unbound config file can be overridden with: +# # config_file = "/etc/unbound/unbound.conf" +# +# ## The default timeout of 1s can be overridden with: +# # timeout = "1s" +# +# ## When set to true, thread metrics are tagged with the thread id. 
+# ## +# ## The default is false for backwards compatibility, and will be changed to +# ## true in a future version. It is recommended to set to true on new +# ## deployments. +# thread_as_tag = false + + +# # Read uWSGI metrics. +# [[inputs.uwsgi]] +# ## List with urls of uWSGI Stats servers. URL must match pattern: +# ## scheme://address[:port] +# ## +# ## For example: +# ## servers = ["tcp://localhost:5050", "http://localhost:1717", "unix:///tmp/statsock"] +# servers = ["tcp://127.0.0.1:1717"] +# +# ## General connection timeout +# # timeout = "5s" + + +# # A plugin to collect stats from Varnish HTTP Cache +# [[inputs.varnish]] +# ## If running as a restricted user you can prepend sudo for additional access: +# #use_sudo = false +# +# ## The default location of the varnishstat binary can be overridden with: +# binary = "/usr/bin/varnishstat" +# +# ## By default, telegraf gathers stats for 3 metric points. +# ## Setting stats will override the defaults shown below. +# ## Glob matching can be used, ie, stats = ["MAIN.*"] +# ## stats may also be set to ["*"], which will collect all stats +# stats = ["MAIN.cache_hit", "MAIN.cache_miss", "MAIN.uptime"] +# +# ## Optional name for the varnish instance (or working directory) to query +# ## Usually appended after -n in varnish cli +# # instance_name = instanceName +# +# ## Timeout for varnishstat command +# # timeout = "1s" + + +# # Collect Wireguard server interface and peer statistics +# [[inputs.wireguard]] +# ## Optional list of Wireguard device/interface names to query. +# ## If omitted, all Wireguard interfaces are queried.
+# # devices = ["wg0"] + + +# # Monitor wifi signal strength and quality +# [[inputs.wireless]] +# ## Sets 'proc' directory path +# ## If not specified, then default is /proc +# # host_proc = "/proc" + + +# # Reads metrics from an SSL certificate +# [[inputs.x509_cert]] +# ## List certificate sources +# sources = ["/etc/ssl/certs/ssl-cert-snakeoil.pem", "tcp://example.org:443"] +# +# ## Timeout for SSL connection +# # timeout = "5s" +# +# ## Pass a different name into the TLS request (Server Name Indication) +# ## example: server_name = "myhost.example.org" +# # server_name = "" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" + + +# # Read metrics of ZFS from arcstats, zfetchstats, vdev_cache_stats, and pools +# [[inputs.zfs]] +# ## ZFS kstat path. Ignored on FreeBSD +# ## If not specified, then default is: +# # kstatPath = "/proc/spl/kstat/zfs" +# +# ## By default, telegraf gathers all zfs stats +# ## If not specified, then default is: +# # kstatMetrics = ["arcstats", "zfetchstats", "vdev_cache_stats"] +# ## For Linux, the default is: +# # kstatMetrics = ["abdstats", "arcstats", "dnodestats", "dbufcachestats", +# # "dmu_tx", "fm", "vdev_mirror_stats", "zfetchstats", "zil"] +# ## By default, don't gather zpool stats +# # poolMetrics = false + + +# # Reads 'mntr' stats from one or many zookeeper servers +# [[inputs.zookeeper]] +# ## An array of addresses to gather stats about. Specify an ip or hostname +# ## with port. ie localhost:2181, 10.0.0.1:2181, etc. +# +# ## If no servers are specified, then localhost is used as the host. +# ## If no port is specified, 2181 is used +# servers = [":2181"] +# +# ## Timeout for metric collections from all servers. Minimum timeout is "1s".
+# # timeout = "5s" +# +# ## Optional TLS Config +# # enable_tls = true +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## If false, skip chain & host verification +# # insecure_skip_verify = true + + +############################################################################### +# SERVICE INPUT PLUGINS # +############################################################################### + + +# # AMQP consumer plugin +# [[inputs.amqp_consumer]] +# ## Broker to consume from. +# ## deprecated in 1.7; use the brokers option +# # url = "amqp://localhost:5672/influxdb" +# +# ## Brokers to consume from. If multiple brokers are specified a random broker +# ## will be selected anytime a connection is established. This can be +# ## helpful for load balancing when not using a dedicated load balancer. +# brokers = ["amqp://localhost:5672/influxdb"] +# +# ## Authentication credentials for the PLAIN auth_method. +# # username = "" +# # password = "" +# +# ## Name of the exchange to declare. If unset, no exchange will be declared. +# exchange = "telegraf" +# +# ## Exchange type; common types are "direct", "fanout", "topic", "header", "x-consistent-hash". +# # exchange_type = "topic" +# +# ## If true, exchange will be passively declared. +# # exchange_passive = false +# +# ## Exchange durability can be either "transient" or "durable". +# # exchange_durability = "durable" +# +# ## Additional exchange arguments. +# # exchange_arguments = { } +# # exchange_arguments = {"hash_property" = "timestamp"} +# +# ## AMQP queue name. +# queue = "telegraf" +# +# ## AMQP queue durability can be "transient" or "durable". +# queue_durability = "durable" +# +# ## If true, queue will be passively declared. +# # queue_passive = false +# +# ## A binding between the exchange and queue using this binding key is +# ## created. If unset, no binding is created. 
+# binding_key = "#" +# +# ## Maximum number of messages server should give to the worker. +# # prefetch_count = 50 +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output. For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Auth method. PLAIN and EXTERNAL are supported +# ## Using EXTERNAL requires enabling the rabbitmq_auth_mechanism_ssl plugin as +# ## described here: https://www.rabbitmq.com/plugins.html +# # auth_method = "PLAIN" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Content encoding for message payloads, can be set to "gzip" or to +# ## "identity" to apply no encoding. +# # content_encoding = "identity" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read Cassandra metrics through Jolokia +# [[inputs.cassandra]] +# ## DEPRECATED: The cassandra plugin has been deprecated. Please use the +# ## jolokia2 plugin instead.
+# ## +# ## see https://github.com/influxdata/telegraf/tree/master/plugins/inputs/jolokia2 +# +# context = "/jolokia/read" +# ## List of cassandra servers exposing jolokia read service +# servers = ["myuser:mypassword@10.10.10.1:8778","10.10.10.2:8778",":8778"] +# ## List of metrics collected on above servers +# ## Each metric consists of a jmx path. +# ## This will collect all heap memory usage metrics from the jvm and +# ## ReadLatency metrics for all keyspaces and tables. +# ## "type=Table" in the query works with Cassandra3.0. Older versions might +# ## need to use "type=ColumnFamily" +# metrics = [ +# "/java.lang:type=Memory/HeapMemoryUsage", +# "/org.apache.cassandra.metrics:type=Table,keyspace=*,scope=*,name=ReadLatency" +# ] + + +# # Cisco model-driven telemetry (MDT) input plugin for IOS XR, IOS XE and NX-OS platforms +# [[inputs.cisco_telemetry_mdt]] +# ## Telemetry transport can be "tcp" or "grpc". TLS is only supported when +# ## using the grpc transport. +# transport = "grpc" +# +# ## Address and port to host telemetry listener +# service_address = ":57000" +# +# ## Enable TLS; grpc transport only. +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## Enable TLS client authentication and define allowed CA certificates; grpc +# ## transport only. 
+# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Define (for certain nested telemetry measurements with embedded tags) which fields are tags +# # embedded_tags = ["Cisco-IOS-XR-qos-ma-oper:qos/interface-table/interface/input/service-policy-names/service-policy-instance/statistics/class-stats/class-name"] +# +# ## Define aliases to map telemetry encoding paths to simple measurement names +# [inputs.cisco_telemetry_mdt.aliases] +# ifstats = "ietf-interfaces:interfaces-state/interface/statistics" + + +# # Read metrics from one or many ClickHouse servers +# [[inputs.clickhouse]] +# ## Username for authorization on ClickHouse server +# ## example: username = "default" +# username = "default" +# +# ## Password for authorization on ClickHouse server +# ## example: password = "super_secret" +# +# ## HTTP(s) timeout while getting metrics values +# ## The timeout includes connection time, any redirects, and reading the response body. +# ## example: timeout = 1s +# # timeout = 5s +# +# ## List of servers for metrics scraping +# ## metrics scrape via HTTP(s) clickhouse interface +# ## https://clickhouse.tech/docs/en/interfaces/http/ +# ## example: servers = ["http://127.0.0.1:8123","https://custom-server.mdb.yandexcloud.net"] +# servers = ["http://127.0.0.1:8123"] +# +# ## If "auto_discovery" is "true", the plugin tries to connect to all servers available in the cluster +# ## using the same "user:password" described in the "username" and "password" parameters +# ## and gets the server hostname list from the "system.clusters" table +# ## see +# ## - https://clickhouse.tech/docs/en/operations/system_tables/#system-clusters +# ## - https://clickhouse.tech/docs/en/operations/server_settings/settings/#server_settings_remote_servers +# ## - https://clickhouse.tech/docs/en/operations/table_engines/distributed/ +# ## - https://clickhouse.tech/docs/en/operations/table_engines/replication/#creating-replicated-tables +# ## example: auto_discovery = false +# # auto_discovery = true +# +#
## Filter cluster names in "system.clusters" when "auto_discovery" is "true" +# ## when this filter is present then "WHERE cluster IN (...)" filter will apply +# ## please use only full cluster names here, regexp and glob filters are not allowed +# ## for "/etc/clickhouse-server/config.d/remote.xml" +# ## <yandex> +# ## <remote_servers> +# ## <my-own-cluster> +# ## <shard> +# ## <replica><host>clickhouse-ru-1.local</host><port>9000</port></replica> +# ## <replica><host>clickhouse-ru-2.local</host><port>9000</port></replica> +# ## </shard> +# ## <shard> +# ## <replica><host>clickhouse-eu-1.local</host><port>9000</port></replica> +# ## <replica><host>clickhouse-eu-2.local</host><port>9000</port></replica> +# ## </shard> +# ## </my-own-cluster> +# ## </remote_servers> +# ## +# ## </yandex> +# ## example: cluster_include = ["my-own-cluster"] +# # cluster_include = [] +# +# ## Filter cluster names in "system.clusters" when "auto_discovery" is "true" +# ## when this filter is present then "WHERE cluster NOT IN (...)" filter will apply +# ## example: cluster_exclude = ["my-internal-not-discovered-cluster"] +# # cluster_exclude = [] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from Google PubSub +# [[inputs.cloud_pubsub]] +# ## Required. Name of Google Cloud Platform (GCP) Project that owns +# ## the given PubSub subscription. +# project = "my-project" +# +# ## Required. Name of PubSub subscription to ingest metrics from. +# subscription = "my-subscription" +# +# ## Required. Data format to consume. +# ## Each data format has its own unique set of configuration options. +# ## Read more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# +# ## Optional. Filepath for GCP credentials JSON file to authorize calls to +# ## PubSub APIs. If not set explicitly, Telegraf will attempt to use +# ## Application Default Credentials, which is preferred. +# # credentials_file = "path/to/my/creds.json" +# +# ## Optional.
Number of seconds to wait before attempting to restart the +# ## PubSub subscription receiver after an unexpected error. +# ## If the streaming pull for a PubSub Subscription fails (receiver), +# ## the agent attempts to restart receiving messages after this many seconds. +# # retry_delay_seconds = 5 +# +# ## Optional. Maximum byte length of a message to consume. +# ## Larger messages are dropped with an error. If less than 0 or unspecified, +# ## treated as no limit. +# # max_message_len = 1000000 +# +# ## Optional. Maximum messages to read from PubSub that have not been written +# ## to an output. Defaults to 1000. +# ## For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message contains 10 metrics and the output +# ## metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## The following are optional Subscription ReceiveSettings in PubSub. +# ## Read more about these values: +# ## https://godoc.org/cloud.google.com/go/pubsub#ReceiveSettings +# +# ## Optional. Maximum number of seconds for which a PubSub subscription +# ## should auto-extend the PubSub ACK deadline for each message. If less than +# ## 0, auto-extension is disabled. +# # max_extension = 0 +# +# ## Optional. Maximum number of unprocessed messages in PubSub +# ## (unacknowledged but not yet expired in PubSub). +# ## A value of 0 is treated as the default PubSub value. +# ## Negative values will be treated as unlimited. +# # max_outstanding_messages = 0 +# +# ## Optional. Maximum size in bytes of unprocessed messages in PubSub +# ## (unacknowledged but not yet expired in PubSub). +# ## A value of 0 is treated as the default PubSub value. +# ## Negative values will be treated as unlimited. 
+# # max_outstanding_bytes = 0 +# +# ## Optional. Max number of goroutines a PubSub Subscription receiver can spawn +# ## to pull messages from PubSub concurrently. This limit applies to each +# ## subscription separately and is treated as the PubSub default if less than +# ## 1. Note this setting does not limit the number of messages that can be +# ## processed concurrently (use "max_outstanding_messages" instead). +# # max_receiver_go_routines = 0 +# +# ## Optional. If true, Telegraf will attempt to base64 decode the +# ## PubSub message data before parsing +# # base64_data = false + + +# # Google Cloud Pub/Sub Push HTTP listener +# [[inputs.cloud_pubsub_push]] +# ## Address and port to host HTTP listener on +# service_address = ":8080" +# +# ## Application secret to verify messages originate from Cloud Pub/Sub +# # token = "" +# +# ## Path to listen to. +# # path = "/" +# +# ## Maximum duration before timing out read of the request +# # read_timeout = "10s" +# ## Maximum duration before timing out write of the response. This should be set to a value +# ## large enough that you can send at least 'metric_batch_size' number of messages within the +# ## duration. +# # write_timeout = "10s" +# +# ## Maximum allowed http request body size in bytes. +# ## 0 means to use the default of 524,288,000 bytes (500 mebibytes) +# # max_body_size = "500MB" +# +# ## Whether to add the pubsub metadata, such as message attributes and subscription as a tag. +# # add_meta = false +# +# ## Optional. Maximum messages to read from PubSub that have not been written +# ## to an output. Defaults to 1000. +# ## For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size.
+# ## +# ## For example, if each message contains 10 metrics and the output +# ## metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Set one or more allowed client CA certificate file names to +# ## enable mutually authenticated TLS connections +# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Add service certificate and key +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read logging output from the Docker engine +# [[inputs.docker_log]] +# ## Docker Endpoint +# ## To use TCP, set endpoint = "tcp://[ip]:[port]" +# ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# # endpoint = "unix:///var/run/docker.sock" +# +# ## When true, container logs are read from the beginning; otherwise +# ## reading begins at the end of the log. +# # from_beginning = false +# +# ## Timeout for Docker API calls. +# # timeout = "5s" +# +# ## Containers to include and exclude. Globs accepted. +# ## Note that an empty array for both will include all containers +# # container_name_include = [] +# # container_name_exclude = [] +# +# ## Container states to include and exclude. Globs accepted. +# ## When empty only containers in the "running" state will be captured. +# # container_state_include = [] +# # container_state_exclude = [] +# +# ## docker labels to include and exclude as tags. Globs accepted. 
+# ## Note that an empty array for both will include all labels as tags +# # docker_label_include = [] +# # docker_label_exclude = [] +# +# ## Set the source tag for the metrics to the container ID hostname, eg first 12 chars +# source_tag = false +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Azure Event Hubs service input plugin +# [[inputs.eventhub_consumer]] +# ## The default behavior is to create a new Event Hub client from environment variables. +# ## This requires one of the following sets of environment variables to be set: +# ## +# ## 1) Expected Environment Variables: +# ## - "EVENTHUB_NAMESPACE" +# ## - "EVENTHUB_NAME" +# ## - "EVENTHUB_CONNECTION_STRING" +# ## +# ## 2) Expected Environment Variables: +# ## - "EVENTHUB_NAMESPACE" +# ## - "EVENTHUB_NAME" +# ## - "EVENTHUB_KEY_NAME" +# ## - "EVENTHUB_KEY_VALUE" +# +# ## Uncommenting the option below will create an Event Hub client based solely on the connection string. +# ## This can either be the associated environment variable or hard coded directly. +# # connection_string = "" +# +# ## Set persistence directory to a valid folder to use a file persister instead of an in-memory persister +# # persistence_dir = "" +# +# ## Change the default consumer group +# # consumer_group = "" +# +# ## By default the event hub receives all messages present on the broker, alternative modes can be set below. +# ## The timestamp should be in https://github.com/toml-lang/toml#offset-date-time format (RFC 3339). +# ## The 3 options below only apply if no valid persister is read from memory or file (e.g. first run). 
+# # from_timestamp = +# # latest = true +# +# ## Set a custom prefetch count for the receiver(s) +# # prefetch_count = 1000 +# +# ## Add an epoch to the receiver(s) +# # epoch = 0 +# +# ## Change to set a custom user agent, "telegraf" is used by default +# # user_agent = "telegraf" +# +# ## To consume from a specific partition, set the partition_ids option. +# ## An empty array will result in receiving from all partitions. +# # partition_ids = ["0","1"] +# +# ## Max undelivered messages +# # max_undelivered_messages = 1000 +# +# ## Set either option below to true to use a system property as timestamp. +# ## You have the choice between EnqueuedTime and IoTHubEnqueuedTime. +# ## It is recommended to use this setting when the data itself has no timestamp. +# # enqueued_time_as_ts = true +# # iot_hub_enqueued_time_as_ts = true +# +# ## Tags or fields to create from keys present in the application property bag. +# ## These could for example be set by message enrichments in Azure IoT Hub. +# # application_property_tags = [] +# # application_property_fields = [] +# +# ## Tag or field name to use for metadata +# ## By default all metadata is disabled +# # sequence_number_field = "SequenceNumber" +# # enqueued_time_field = "EnqueuedTime" +# # offset_field = "Offset" +# # partition_id_tag = "PartitionID" +# # partition_key_tag = "PartitionKey" +# # iot_hub_device_connection_id_tag = "IoTHubDeviceConnectionID" +# # iot_hub_auth_generation_id_tag = "IoTHubAuthGenerationID" +# # iot_hub_connection_auth_method_tag = "IoTHubConnectionAuthMethod" +# # iot_hub_connection_module_id_tag = "IoTHubConnectionModuleID" +# # iot_hub_enqueued_time_field = "IoTHubEnqueuedTime" +# +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Run executable as long-running input plugin +# [[inputs.execd]] +# ## Program to run as daemon +# command = ["telegraf-smartctl", "-d", "/dev/sda"] +# +# ## Define how the process is signaled on each collection interval. +# ## Valid values are: +# ## "none" : Do not signal anything. +# ## The process must output metrics by itself. +# ## "STDIN" : Send a newline on STDIN. +# ## "SIGHUP" : Send a HUP signal. Not available on Windows. +# ## "SIGUSR1" : Send a USR1 signal. Not available on Windows. +# ## "SIGUSR2" : Send a USR2 signal. Not available on Windows. +# signal = "none" +# +# ## Delay before the process is restarted after an unexpected termination +# restart_delay = "10s" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # gNMI telemetry input plugin +# [[inputs.gnmi]] +# ## Address and port of the gNMI GRPC server +# addresses = ["10.49.234.114:57777"] +# +# ## define credentials +# username = "cisco" +# password = "cisco" +# +# ## gNMI encoding requested (one of: "proto", "json", "json_ietf") +# # encoding = "proto" +# +# ## redial in case of failures after +# redial = "10s" +# +# ## enable client-side TLS and define CA to authenticate the device +# # enable_tls = true +# # tls_ca = "/etc/telegraf/ca.pem" +# # insecure_skip_verify = true +# +# ## define client-side TLS certificate & key to authenticate to the device +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## gNMI subscription prefix (optional, can usually be left empty) +# ## See: 
https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-specification.md#222-paths +# # origin = "" +# # prefix = "" +# # target = "" +# +# ## Define additional aliases to map telemetry encoding paths to simple measurement names +# #[inputs.gnmi.aliases] +# # ifcounters = "openconfig:/interfaces/interface/state/counters" +# +# [[inputs.gnmi.subscription]] +# ## Name of the measurement that will be emitted +# name = "ifcounters" +# +# ## Origin and path of the subscription +# ## See: https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-specification.md#222-paths +# ## +# ## origin usually refers to a (YANG) data model implemented by the device +# ## and path to a specific substructure inside it that should be subscribed to (similar to an XPath) +# ## YANG models can be found e.g. here: https://github.com/YangModels/yang/tree/master/vendor/cisco/xr +# origin = "openconfig-interfaces" +# path = "/interfaces/interface/state/counters" +# +# # Subscription mode (one of: "target_defined", "sample", "on_change") and interval +# subscription_mode = "sample" +# sample_interval = "10s" +# +# ## Suppress redundant transmissions when measured values are unchanged +# # suppress_redundant = false +# +# ## If suppression is enabled, send updates at least every X seconds anyway +# # heartbeat_interval = "60s" + + +# # Accept metrics over InfluxDB 1.x HTTP API +# [[inputs.http_listener]] +# ## Address and port to host InfluxDB listener on +# service_address = ":8186" +# +# ## maximum duration before timing out read of the request +# read_timeout = "10s" +# ## maximum duration before timing out write of the response +# write_timeout = "10s" +# +# ## Maximum allowed HTTP request body size in bytes. +# ## 0 means to use the default of 32MiB. +# max_body_size = "32MiB" +# +# ## Optional tag name used to store the database. +# ## If the write has a database in the query string then it will be kept in this tag name. +# ## This tag can be used in downstream outputs. 
+# ## The default value of nothing means it will be off and the database will not be recorded. +# # database_tag = "" +# +# ## If set the retention policy specified in the write query will be added as +# ## the value of this tag name. +# # retention_policy_tag = "" +# +# ## Set one or more allowed client CA certificate file names to +# ## enable mutually authenticated TLS connections +# tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Add service certificate and key +# tls_cert = "/etc/telegraf/cert.pem" +# tls_key = "/etc/telegraf/key.pem" +# +# ## Optional username and password to accept for HTTP basic authentication. +# ## You probably want to make sure you have TLS configured above for this. +# # basic_username = "foobar" +# # basic_password = "barfoo" + + +# # Generic HTTP write listener +# [[inputs.http_listener_v2]] +# ## Address and port to host HTTP listener on +# service_address = ":8080" +# +# ## Path to listen to. +# # path = "/telegraf" +# +# ## HTTP methods to accept. +# # methods = ["POST", "PUT"] +# +# ## maximum duration before timing out read of the request +# # read_timeout = "10s" +# ## maximum duration before timing out write of the response +# # write_timeout = "10s" +# +# ## Maximum allowed http request body size in bytes. +# ## 0 means to use the default of 524,288,000 bytes (500 mebibytes) +# # max_body_size = "500MB" +# +# ## Part of the request to consume. Available options are "body" and +# ## "query". +# # data_source = "body" +# +# ## Set one or more allowed client CA certificate file names to +# ## enable mutually authenticated TLS connections +# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Add service certificate and key +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## Optional username and password to accept for HTTP basic authentication. +# ## You probably want to make sure you have TLS configured above for this.
+# # basic_username = "foobar" +# # basic_password = "barfoo" +# +# ## Optional setting to map http headers into tags +# ## If the http header is not present on the request, no corresponding tag will be added +# ## If multiple instances of the http header are present, only the first value will be used +# # http_header_tags = {"HTTP_HEADER" = "TAG_NAME"} +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Accept metrics over InfluxDB 1.x HTTP API +# [[inputs.influxdb_listener]] +# ## Address and port to host InfluxDB listener on +# service_address = ":8186" +# +# ## maximum duration before timing out read of the request +# read_timeout = "10s" +# ## maximum duration before timing out write of the response +# write_timeout = "10s" +# +# ## Maximum allowed HTTP request body size in bytes. +# ## 0 means to use the default of 32MiB. +# max_body_size = "32MiB" +# +# ## Optional tag name used to store the database. +# ## If the write has a database in the query string then it will be kept in this tag name. +# ## This tag can be used in downstream outputs. +# ## The default value of nothing means it will be off and the database will not be recorded. +# # database_tag = "" +# +# ## If set the retention policy specified in the write query will be added as +# ## the value of this tag name. +# # retention_policy_tag = "" +# +# ## Set one or more allowed client CA certificate file names to +# ## enable mutually authenticated TLS connections +# tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Add service certificate and key +# tls_cert = "/etc/telegraf/cert.pem" +# tls_key = "/etc/telegraf/key.pem" +# +# ## Optional username and password to accept for HTTP basic authentication. +# ## You probably want to make sure you have TLS configured above for this. 
+# # basic_username = "foobar" +# # basic_password = "barfoo" + + +# # Read JTI OpenConfig Telemetry from listed sensors +# [[inputs.jti_openconfig_telemetry]] +# ## List of device addresses to collect telemetry from +# servers = ["localhost:1883"] +# +# ## Authentication details. Username and password are required if the device expects +# ## authentication. Client ID must be unique when connecting from multiple instances +# ## of telegraf to the same device +# username = "user" +# password = "pass" +# client_id = "telegraf" +# +# ## Frequency to get data +# sample_frequency = "1000ms" +# +# ## Sensors to subscribe for +# ## An identifier for each sensor can be provided in path by separating with space +# ## Else sensor path will be used as identifier +# ## When identifier is used, we can provide a list of space separated sensors. +# ## A single subscription will be created with all these sensors and data will +# ## be saved to measurement with this identifier name +# sensors = [ +# "/interfaces/", +# "collection /components/ /lldp", +# ] +# +# ## We allow specifying sensor group level reporting rate. To do this, specify the +# ## reporting rate in Duration at the beginning of sensor paths / collection +# ## name. For entries without reporting rate, we use configured sample frequency +# sensors = [ +# "1000ms customReporting /interfaces /lldp", +# "2000ms collection /components", +# "/interfaces", +# ] +# +# ## Optional TLS Config +# # enable_tls = true +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Delay between retry attempts of failed RPC calls or streams. Defaults to 1000ms.
+# ## Failed streams/calls will not be retried if 0 is provided +# retry_delay = "1000ms" +# +# ## To treat all string values as tags, set this to true +# str_as_tags = false + + +# # Read metrics from Kafka topics +# [[inputs.kafka_consumer]] +# ## Kafka brokers. +# brokers = ["localhost:9092"] +# +# ## Topics to consume. +# topics = ["telegraf"] +# +# ## When set this tag will be added to all metrics with the topic as the value. +# # topic_tag = "" +# +# ## Optional Client id +# # client_id = "Telegraf" +# +# ## Set the minimal supported Kafka version. Setting this enables the use of new +# ## Kafka features and APIs. Must be 0.10.2.0 or greater. +# ## ex: version = "1.1.0" +# # version = "" +# +# ## Optional TLS Config +# # enable_tls = true +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## SASL authentication credentials. These settings should typically be used +# ## with TLS encryption enabled using the "enable_tls" option. +# # sasl_username = "kafka" +# # sasl_password = "secret" +# +# ## SASL protocol version. When connecting to Azure EventHub set to 0. +# # sasl_version = 1 +# +# ## Name of the consumer group. +# # consumer_group = "telegraf_metrics_consumers" +# +# ## Initial offset position; one of "oldest" or "newest". +# # offset = "oldest" +# +# ## Consumer group partition assignment strategy; one of "range", "roundrobin" or "sticky". +# # balance_strategy = "range" +# +# ## Maximum length of a message to consume, in bytes (default 0/unlimited); +# ## larger messages are dropped +# max_message_len = 1000000 +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output. For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. 
+# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read metrics from Kafka topic(s) +# [[inputs.kafka_consumer_legacy]] +# ## topic(s) to consume +# topics = ["telegraf"] +# +# ## an array of Zookeeper connection strings +# zookeeper_peers = ["localhost:2181"] +# +# ## Zookeeper Chroot +# zookeeper_chroot = "" +# +# ## the name of the consumer group +# consumer_group = "telegraf_metrics_consumers" +# +# ## Offset (must be either "oldest" or "newest") +# offset = "oldest" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# +# ## Maximum length of a message to consume, in bytes (default 0/unlimited); +# ## larger messages are dropped +# max_message_len = 65536 + + +# # Configuration for the AWS Kinesis input. +# [[inputs.kinesis_consumer]] +# ## Amazon REGION of kinesis endpoint. 
+# region = "ap-southeast-2" +# +# ## Amazon Credentials +# ## Credentials are loaded in the following order +# ## 1) Assumed credentials via STS if role_arn is specified +# ## 2) explicit credentials from 'access_key' and 'secret_key' +# ## 3) shared profile from 'profile' +# ## 4) environment variables +# ## 5) shared credentials file +# ## 6) EC2 Instance Profile +# # access_key = "" +# # secret_key = "" +# # token = "" +# # role_arn = "" +# # profile = "" +# # shared_credential_file = "" +# +# ## Endpoint to make request against, the correct endpoint is automatically +# ## determined and this option should only be set if you wish to override the +# ## default. +# ## ex: endpoint_url = "http://localhost:8000" +# # endpoint_url = "" +# +# ## Kinesis StreamName must exist prior to starting telegraf. +# streamname = "StreamName" +# +# ## Shard iterator type (only 'TRIM_HORIZON' and 'LATEST' currently supported) +# # shard_iterator_type = "TRIM_HORIZON" +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output. For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# +# ## Optional +# ## Configuration for a dynamodb checkpoint +# [inputs.kinesis_consumer.checkpoint_dynamodb] +# ## unique name for this consumer +# app_name = "default" +# table_name = "default" + + +# # Read metrics off Arista LANZ, via socket +# [[inputs.lanz]] +# ## URL to Arista LANZ endpoint +# servers = [ +# "tcp://127.0.0.1:50001" +# ] + + +# # Stream and parse log file(s). +# [[inputs.logparser]] +# ## Log files to parse. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## /var/log/**.log -> recursively find all .log files in /var/log +# ## /var/log/*/*.log -> find all .log files with a parent dir in /var/log +# ## /var/log/apache.log -> only tail the apache log file +# files = ["/var/log/apache/access.log"] +# +# ## Read files that currently exist from the beginning. Files that are created +# ## while telegraf is running (and that match the "files" globs) will always +# ## be read from the beginning. +# from_beginning = false +# +# ## Method used to watch for file updates. Can be either "inotify" or "poll". +# # watch_method = "inotify" +# +# ## Parse logstash-style "grok" patterns: +# [inputs.logparser.grok] +# ## This is a list of patterns to check the given log file(s) for. +# ## Note that adding patterns here increases processing time. The most +# ## efficient configuration is to have one pattern per logparser. +# ## Other common built-in patterns are: +# ## %{COMMON_LOG_FORMAT} (plain apache & nginx access logs) +# ## %{COMBINED_LOG_FORMAT} (access logs + referrer & agent) +# patterns = ["%{COMBINED_LOG_FORMAT}"] +# +# ## Name of the outputted measurement name. +# measurement = "apache_access_log" +# +# ## Full path(s) to custom pattern files. 
+# custom_pattern_files = [] +# +# ## Custom patterns can also be defined here. Put one pattern per line. +# custom_patterns = ''' +# ''' +# +# ## Timezone allows you to provide an override for timestamps that +# ## don't already include an offset +# ## e.g. 04/06/2016 12:41:45 data one two 5.43µs +# ## +# ## Default: "" which renders UTC +# ## Options are as follows: +# ## 1. Local -- interpret based on machine localtime +# ## 2. "Canada/Eastern" -- Unix TZ values like those found in https://en.wikipedia.org/wiki/List_of_tz_database_time_zones +# ## 3. UTC -- or blank/unspecified, will return timestamp in UTC +# # timezone = "Canada/Eastern" +# +# ## When set to "disable", timestamp will not be incremented if there is a +# ## duplicate. +# # unique_timestamp = "auto" + + +# # Read metrics from MQTT topic(s) +# [[inputs.mqtt_consumer]] +# ## Broker URLs for the MQTT server or cluster. To connect to multiple +# ## clusters or standalone servers, use a separate plugin instance. +# ## example: servers = ["tcp://localhost:1883"] +# ## servers = ["ssl://localhost:1883"] +# ## servers = ["ws://localhost:1883"] +# servers = ["tcp://127.0.0.1:1883"] +# +# ## Topics that will be subscribed to. +# topics = [ +# "telegraf/host01/cpu", +# "telegraf/+/mem", +# "sensors/#", +# ] +# +# ## The message topic will be stored in a tag specified by this value. If set +# ## to the empty string no topic tag will be created. +# # topic_tag = "topic" +# +# ## QoS policy for messages +# ## 0 = at most once +# ## 1 = at least once +# ## 2 = exactly once +# ## +# ## When using a QoS of 1 or 2, you should enable persistent_session to allow +# ## resuming unacknowledged messages. +# # qos = 0 +# +# ## Connection timeout for initial connection in seconds +# # connection_timeout = "30s" +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output.
For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Persistent session disables clearing of the client session on connection. +# ## In order for this option to work you must also set client_id to identify +# ## the client. To receive messages that arrived while the client is offline, +# ## also set the qos option to 1 or 2 and don't forget to also set the QoS when +# ## publishing. +# # persistent_session = false +# +# ## If unset, a random client ID will be generated. +# # client_id = "" +# +# ## Username and password to connect MQTT server. +# # username = "telegraf" +# # password = "metricsmetricsmetricsmetrics" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read metrics from NATS subject(s) +# [[inputs.nats_consumer]] +# ## urls of NATS servers +# servers = ["nats://localhost:4222"] +# +# ## subject(s) to consume +# subjects = ["telegraf"] +# +# ## name a queue group +# queue_group = "telegraf_consumers" +# +# ## Optional credentials +# # username = "" +# # password = "" +# +# ## Optional NATS 2.0 and NATS NGS compatible user credentials +# # credentials = "/etc/telegraf/nats.creds" +# +# ## Use Transport Layer Security +# # secure = false +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Sets the limits for pending msgs and bytes for each subscription +# ## These shouldn't need to be adjusted except in very high throughput scenarios +# # pending_message_limit = 65536 +# # pending_bytes_limit = 67108864 +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output. For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read NSQ topic for metrics. +# [[inputs.nsq_consumer]] +# ## Server option still works but is deprecated, we just prepend it to the nsqd array. +# # server = "localhost:4150" +# +# ## An array representing the NSQD TCP HTTP Endpoints +# nsqd = ["localhost:4150"] +# +# ## An array representing the NSQLookupd HTTP Endpoints +# nsqlookupd = ["localhost:4161"] +# topic = "telegraf" +# channel = "consumer" +# max_in_flight = 100 +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output. For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read metrics from one or many pgbouncer servers +# [[inputs.pgbouncer]] +# ## specify address via a url matching: +# ## postgres://[pqgotest[:password]]@localhost[/dbname]\ +# ## ?sslmode=[disable|verify-ca|verify-full] +# ## or a simple string: +# ## host=localhost user=pqgotest password=... sslmode=... dbname=app_production +# ## +# ## All connection parameters are optional. 
+# ## +# address = "host=localhost user=pgbouncer sslmode=disable" + + +# # Read metrics from one or many postgresql servers +# [[inputs.postgresql]] +# ## specify address via a url matching: +# ## postgres://[pqgotest[:password]]@localhost[/dbname]\ +# ## ?sslmode=[disable|verify-ca|verify-full] +# ## or a simple string: +# ## host=localhost user=pqgotest password=... sslmode=... dbname=app_production +# ## +# ## All connection parameters are optional. +# ## +# ## Without the dbname parameter, the driver will default to a database +# ## with the same name as the user. This dbname is just for instantiating a +# ## connection with the server and doesn't restrict the databases we are trying +# ## to grab metrics for. +# ## +# address = "host=localhost user=postgres sslmode=disable" +# ## A custom name for the database that will be used as the "server" tag in the +# ## measurement output. If not specified, a default one generated from +# ## the connection address is used. +# # outputaddress = "db01" +# +# ## connection configuration. +# ## maxlifetime - specify the maximum lifetime of a connection. +# ## default is forever (0s) +# max_lifetime = "0s" +# +# ## A list of databases to explicitly ignore. If not specified, metrics for all +# ## databases are gathered. Do NOT use with the 'databases' option. +# # ignored_databases = ["postgres", "template0", "template1"] +# +# ## A list of databases to pull metrics about. If not specified, metrics for all +# ## databases are gathered. Do NOT use with the 'ignored_databases' option. +# # databases = ["app_production", "testing"] + + +# # Read metrics from one or many postgresql servers +# [[inputs.postgresql_extensible]] +# ## specify address via a url matching: +# ## postgres://[pqgotest[:password]]@localhost[/dbname]\ +# ## ?sslmode=[disable|verify-ca|verify-full] +# ## or a simple string: +# ## host=localhost user=pqgotest password=... sslmode=... dbname=app_production +# # +# ## All connection parameters are optional. 
# +# ## Without the dbname parameter, the driver will default to a database +# ## with the same name as the user. This dbname is just for instantiating a +# ## connection with the server and doesn't restrict the databases we are trying +# ## to grab metrics for. +# # +# address = "host=localhost user=postgres sslmode=disable" +# +# ## connection configuration. +# ## maxlifetime - specify the maximum lifetime of a connection. +# ## default is forever (0s) +# max_lifetime = "0s" +# +# ## A list of databases to pull metrics about. If not specified, metrics for all +# ## databases are gathered. +# ## databases = ["app_production", "testing"] +# # +# ## A custom name for the database that will be used as the "server" tag in the +# ## measurement output. If not specified, a default one generated from +# ## the connection address is used. +# # outputaddress = "db01" +# # +# ## Define the toml config where the sql queries are stored +# ## New queries can be added, if the withdbname is set to true and there is no +# ## databases defined in the 'databases field', the sql query is ended by a +# ## 'is not null' in order to make the query succeed. +# ## Example : +# ## The sqlquery : "SELECT * FROM pg_stat_database where datname" become +# ## "SELECT * FROM pg_stat_database where datname IN ('postgres', 'pgbench')" +# ## because the databases variable was set to ['postgres', 'pgbench' ] and the +# ## withdbname was true. Be careful that if the withdbname is set to false you +# ## don't have to define the where clause (aka with the dbname) the tagvalue +# ## field is used to define custom tags (separated by commas) +# ## The optional "measurement" value can be used to override the default +# ## output measurement name ("postgresql"). +# ## +# ## The script option can be used to specify the .sql file path. 
+# ## If the script and sqlquery options are specified at the same time, sqlquery will be used +# ## +# ## Structure : +# ## [[inputs.postgresql_extensible.query]] +# ## sqlquery string +# ## version string +# ## withdbname boolean +# ## tagvalue string (comma separated) +# ## measurement string +# [[inputs.postgresql_extensible.query]] +# sqlquery="SELECT * FROM pg_stat_database" +# version=901 +# withdbname=false +# tagvalue="" +# measurement="" +# [[inputs.postgresql_extensible.query]] +# sqlquery="SELECT * FROM pg_stat_bgwriter" +# version=901 +# withdbname=false +# tagvalue="postgresql.stats" + + +# # Read metrics from one or many prometheus clients +# [[inputs.prometheus]] +# ## An array of urls to scrape metrics from. +# urls = ["http://localhost:9100/metrics"] +# +# ## Metric version controls the mapping from Prometheus metrics into +# ## Telegraf metrics. When using the prometheus_client output, use the same +# ## value in both plugins to ensure metrics are round-tripped without +# ## modification. +# ## +# ## example: metric_version = 1; deprecated in 1.13 +# ## metric_version = 2; recommended version +# # metric_version = 1 +# +# ## URL tag name (tag containing the scraped URL; optional, default is "url") +# # url_tag = "scrapeUrl" +# +# ## An array of Kubernetes services to scrape metrics from. +# # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] +# +# ## Kubernetes config file to create client from. +# # kube_config = "/path/to/kubernetes.config" +# +# ## Scrape Kubernetes pods for the following prometheus annotations: +# ## - prometheus.io/scrape: Enable scraping for this pod +# ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to +# ## set this to 'https' & most likely set the tls config. +# ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation.
+# ## - prometheus.io/port: If port is not 9102 use this annotation +# # monitor_kubernetes_pods = true +# ## Restricts Kubernetes monitoring to a single namespace +# ## ex: monitor_kubernetes_pods_namespace = "default" +# # monitor_kubernetes_pods_namespace = "" +# # label selector to target pods which have the label +# # kubernetes_label_selector = "env=dev,app=nginx" +# # field selector to target pods +# # eg. To scrape pods on a specific node +# # kubernetes_field_selector = "spec.nodeName=$HOSTNAME" +# +# ## Use bearer token for authorization. ('bearer_token' takes priority) +# # bearer_token = "/path/to/bearer/token" +# ## OR +# # bearer_token_string = "abc_123" +# +# ## HTTP Basic Authentication username and password. ('bearer_token' and +# ## 'bearer_token_string' take priority) +# # username = "" +# # password = "" +# +# ## Specify timeout duration for slower prometheus clients (default is 3s) +# # response_timeout = "3s" +# +# ## Optional TLS Config +# # tls_ca = /path/to/cafile +# # tls_cert = /path/to/certfile +# # tls_key = /path/to/keyfile +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # SFlow V5 Protocol Listener +# [[inputs.sflow]] +# ## Address to listen for sFlow packets. +# ## example: service_address = "udp://:6343" +# ## service_address = "udp4://:6343" +# ## service_address = "udp6://:6343" +# service_address = "udp://:6343" +# +# ## Set the size of the operating system's receive buffer. +# ## example: read_buffer_size = "64KiB" +# # read_buffer_size = "" + + +# # Receive SNMP traps +# [[inputs.snmp_trap]] +# ## Transport, local address, and port to listen on. Transport must +# ## be "udp://". Omit local address to listen on all interfaces. +# ## example: "udp://127.0.0.1:1234" +# ## +# ## Special permissions may be required to listen on a port less than +# ## 1024. 
See README.md for details +# ## +# # service_address = "udp://:162" +# ## Timeout running snmptranslate command +# # timeout = "5s" +# ## Snmp version, defaults to 2c +# # version = "2c" +# ## SNMPv3 authentication and encryption options. +# ## +# ## Security Name. +# # sec_name = "myuser" +# ## Authentication protocol; one of "MD5", "SHA" or "". +# # auth_protocol = "MD5" +# ## Authentication password. +# # auth_password = "pass" +# ## Security Level; one of "noAuthNoPriv", "authNoPriv", or "authPriv". +# # sec_level = "authNoPriv" +# ## Privacy protocol used for encrypted messages; one of "DES", "AES", "AES192", "AES192C", "AES256", "AES256C" or "". +# # priv_protocol = "" +# ## Privacy password used for encrypted messages. +# # priv_password = "" + + +# # Generic socket listener capable of handling multiple socket types. +# [[inputs.socket_listener]] +# ## URL to listen on +# # service_address = "tcp://:8094" +# # service_address = "tcp://127.0.0.1:http" +# # service_address = "tcp4://:8094" +# # service_address = "tcp6://:8094" +# # service_address = "tcp6://[2001:db8::1]:8094" +# # service_address = "udp://:8094" +# # service_address = "udp4://:8094" +# # service_address = "udp6://:8094" +# # service_address = "unix:///tmp/telegraf.sock" +# # service_address = "unixgram:///tmp/telegraf.sock" +# +# ## Change the file mode bits on unix sockets. These permissions may not be +# ## respected by some platforms, to safely restrict write permissions it is best +# ## to place the socket into a directory that has previously been created +# ## with the desired permissions. +# ## ex: socket_mode = "777" +# # socket_mode = "" +# +# ## Maximum number of concurrent connections. +# ## Only applies to stream sockets (e.g. TCP). +# ## 0 (default) is unlimited. +# # max_connections = 1024 +# +# ## Read timeout. +# ## Only applies to stream sockets (e.g. TCP). +# ## 0 (default) is unlimited. +# # read_timeout = "30s" +# +# ## Optional TLS configuration. 
+# ## Only applies to stream sockets (e.g. TCP). +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Enables client authentication if set. +# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Maximum socket buffer size (in bytes when no unit specified). +# ## For stream sockets, once the buffer fills up, the sender will start backing up. +# ## For datagram sockets, once the buffer fills up, metrics will start dropping. +# ## Defaults to the OS default. +# # read_buffer_size = "64KiB" +# +# ## Period between keep alive probes. +# ## Only applies to TCP sockets. +# ## 0 disables keep alive probes. +# ## Defaults to the OS configuration. +# # keep_alive_period = "5m" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# # data_format = "influx" +# +# ## Content encoding for message payloads, can be set to "gzip" or +# ## "identity" to apply no encoding. +# # content_encoding = "identity" + + +# # Statsd UDP/TCP Server +# [[inputs.statsd]] +# ## Protocol, must be "tcp", "udp", "udp4" or "udp6" (default=udp) +# protocol = "udp" +# +# ## MaxTCPConnection - applicable when protocol is set to tcp (default=250) +# max_tcp_connections = 250 +# +# ## Enable TCP keep alive probes (default=false) +# tcp_keep_alive = false +# +# ## Specifies the keep-alive period for an active network connection. +# ## Only applies to TCP sockets and will be ignored if tcp_keep_alive is false. +# ## Defaults to the OS configuration. +# # tcp_keep_alive_period = "2h" +# +# ## Address and port to host UDP listener on +# service_address = ":8125" +# +# ## The following configuration options control when telegraf clears its cache +# ## of previous values. If set to false, then telegraf will only clear its +# ## cache when the daemon is restarted.
+# ## Reset gauges every interval (default=true) +# delete_gauges = true +# ## Reset counters every interval (default=true) +# delete_counters = true +# ## Reset sets every interval (default=true) +# delete_sets = true +# ## Reset timings & histograms every interval (default=true) +# delete_timings = true +# +# ## Percentiles to calculate for timing & histogram stats +# percentiles = [50.0, 90.0, 99.0, 99.9, 99.95, 100.0] +# +# ## separator to use between elements of a statsd metric +# metric_separator = "_" +# +# ## Parses tags in the datadog statsd format +# ## http://docs.datadoghq.com/guides/dogstatsd/ +# parse_data_dog_tags = false +# +# ## Parses datadog extensions to the statsd format +# datadog_extensions = false +# +# ## Statsd data translation templates, more info can be read here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/TEMPLATE_PATTERN.md +# # templates = [ +# # "cpu.* measurement*" +# # ] +# +# ## Number of UDP messages allowed to queue up, once filled, +# ## the statsd server will start dropping packets +# allowed_pending_messages = 10000 +# +# ## Number of timing/histogram values to track per-measurement in the +# ## calculation of percentiles. Raising this limit increases the accuracy +# ## of percentiles but also increases the memory usage and cpu time. +# percentile_limit = 1000 + + +# # Suricata stats plugin +# [[inputs.suricata]] +# ## Data sink for Suricata stats log +# # This is expected to be a filename of a +# # unix socket to be created for listening. +# source = "/var/run/suricata-stats.sock" +# +# # Delimiter for flattening field keys, e.g. subitem "alert" of "detect" +# # becomes "detect_alert" when delimiter is "_". +# delimiter = "_" + + +# # Accepts syslog messages following RFC5424 format with transports as per RFC5426, RFC5425, or RFC6587 +# [[inputs.syslog]] +# ## Specify an ip or hostname with port - eg., tcp://localhost:6514, tcp://10.0.0.1:6514 +# ## Protocol, address and port to host the syslog receiver. 
+# ## If no host is specified, then localhost is used. +# ## If no port is specified, 6514 is used (RFC5425#section-4.1). +# server = "tcp://:6514" +# +# ## TLS Config +# # tls_allowed_cacerts = ["/etc/telegraf/ca.pem"] +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## Period between keep alive probes. +# ## 0 disables keep alive probes. +# ## Defaults to the OS configuration. +# ## Only applies to stream sockets (e.g. TCP). +# # keep_alive_period = "5m" +# +# ## Maximum number of concurrent connections (default = 0). +# ## 0 means unlimited. +# ## Only applies to stream sockets (e.g. TCP). +# # max_connections = 1024 +# +# ## Read timeout is the maximum time allowed for reading a single message (default = 5s). +# ## 0 means unlimited. +# # read_timeout = "5s" +# +# ## The framing technique with which it is expected that messages are transported (default = "octet-counting"). +# ## Whether the messages come using the octet-counting (RFC5425#section-4.3.1, RFC6587#section-3.4.1), +# ## or the non-transparent framing technique (RFC6587#section-3.4.2). +# ## Must be one of "octet-counting", "non-transparent". +# # framing = "octet-counting" +# +# ## The trailer to be expected in case of non-transparent framing (default = "LF"). +# ## Must be one of "LF", or "NUL". +# # trailer = "LF" +# +# ## Whether to parse in best effort mode or not (default = false). +# ## By default best effort parsing is off. +# # best_effort = false +# +# ## Character to prepend to SD-PARAMs (default = "_"). +# ## A syslog message can contain multiple parameters and multiple identifiers within structured data section. +# ## Eg., [id1 name1="val1" name2="val2"][id2 name1="val1" nameA="valA"] +# ## For each combination a field is created. +# ## Its name is created concatenating identifier, sdparam_separator, and parameter name.
+# # sdparam_separator = "_" + + +# # Parse the new lines appended to a file +# [[inputs.tail]] +# ## File names or a pattern to tail. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## "/var/log/**.log" -> recursively find all .log files in /var/log +# ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log +# ## "/var/log/apache.log" -> just tail the apache log file +# ## +# ## See https://github.com/gobwas/glob for more examples +# ## +# files = ["/var/mymetrics.out"] +# +# ## Read file from beginning. +# # from_beginning = false +# +# ## Whether file is a named pipe +# # pipe = false +# +# ## Method used to watch for file updates. Can be either "inotify" or "poll". +# # watch_method = "inotify" +# +# ## Maximum lines of the file to process that have not yet been written by the +# ## output. For best throughput set based on the number of metrics on each +# ## line and the size of the output's metric_batch_size. +# # max_undelivered_lines = 1000 +# +# ## Character encoding to use when interpreting the file contents. Invalid +# ## characters are replaced using the unicode replacement character. When set +# ## to the empty string the data is not decoded to text. +# ## ex: character_encoding = "utf-8" +# ## character_encoding = "utf-16le" +# ## character_encoding = "utf-16be" +# ## character_encoding = "" +# # character_encoding = "" +# +# ## Data format to consume.
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Generic TCP listener +# [[inputs.tcp_listener]] +# # DEPRECATED: the TCP listener plugin has been deprecated in favor of the +# # socket_listener plugin +# # see https://github.com/influxdata/telegraf/tree/master/plugins/inputs/socket_listener + + +# # Generic UDP listener +# [[inputs.udp_listener]] +# # DEPRECATED: the UDP listener plugin has been deprecated in favor of the +# # socket_listener plugin +# # see https://github.com/influxdata/telegraf/tree/master/plugins/inputs/socket_listener + + +# # Read metrics from VMware vCenter +# [[inputs.vsphere]] +# ## List of vCenter URLs to be monitored. These three lines must be uncommented +# ## and edited for the plugin to work. +# vcenters = [ "https://vcenter.local/sdk" ] +# username = "user@corp.local" +# password = "secret" +# +# ## VMs +# ## Typical VM metrics (if omitted or empty, all metrics are collected) +# # vm_include = [ "/*/vm/**"] # Inventory path to VMs to collect (by default all are collected) +# # vm_exclude = [] # Inventory paths to exclude +# vm_metric_include = [ +# "cpu.demand.average", +# "cpu.idle.summation", +# "cpu.latency.average", +# "cpu.readiness.average", +# "cpu.ready.summation", +# "cpu.run.summation", +# "cpu.usagemhz.average", +# "cpu.used.summation", +# "cpu.wait.summation", +# "mem.active.average", +# "mem.granted.average", +# "mem.latency.average", +# "mem.swapin.average", +# "mem.swapinRate.average", +# "mem.swapout.average", +# "mem.swapoutRate.average", +# "mem.usage.average", +# "mem.vmmemctl.average", +# "net.bytesRx.average", +# "net.bytesTx.average", +# "net.droppedRx.summation", +# "net.droppedTx.summation", +# "net.usage.average", +# "power.power.average", +# "virtualDisk.numberReadAveraged.average", +# "virtualDisk.numberWriteAveraged.average", +#
"virtualDisk.read.average", +# "virtualDisk.readOIO.latest", +# "virtualDisk.throughput.usage.average", +# "virtualDisk.totalReadLatency.average", +# "virtualDisk.totalWriteLatency.average", +# "virtualDisk.write.average", +# "virtualDisk.writeOIO.latest", +# "sys.uptime.latest", +# ] +# # vm_metric_exclude = [] ## Nothing is excluded by default +# # vm_instances = true ## true by default +# +# ## Hosts +# ## Typical host metrics (if omitted or empty, all metrics are collected) +# # host_include = [ "/*/host/**"] # Inventory path to hosts to collect (by default all are collected) +# # host_exclude [] # Inventory paths to exclude +# host_metric_include = [ +# "cpu.coreUtilization.average", +# "cpu.costop.summation", +# "cpu.demand.average", +# "cpu.idle.summation", +# "cpu.latency.average", +# "cpu.readiness.average", +# "cpu.ready.summation", +# "cpu.swapwait.summation", +# "cpu.usage.average", +# "cpu.usagemhz.average", +# "cpu.used.summation", +# "cpu.utilization.average", +# "cpu.wait.summation", +# "disk.deviceReadLatency.average", +# "disk.deviceWriteLatency.average", +# "disk.kernelReadLatency.average", +# "disk.kernelWriteLatency.average", +# "disk.numberReadAveraged.average", +# "disk.numberWriteAveraged.average", +# "disk.read.average", +# "disk.totalReadLatency.average", +# "disk.totalWriteLatency.average", +# "disk.write.average", +# "mem.active.average", +# "mem.latency.average", +# "mem.state.latest", +# "mem.swapin.average", +# "mem.swapinRate.average", +# "mem.swapout.average", +# "mem.swapoutRate.average", +# "mem.totalCapacity.average", +# "mem.usage.average", +# "mem.vmmemctl.average", +# "net.bytesRx.average", +# "net.bytesTx.average", +# "net.droppedRx.summation", +# "net.droppedTx.summation", +# "net.errorsRx.summation", +# "net.errorsTx.summation", +# "net.usage.average", +# "power.power.average", +# "storageAdapter.numberReadAveraged.average", +# "storageAdapter.numberWriteAveraged.average", +# "storageAdapter.read.average", +# 
"storageAdapter.write.average", +# "sys.uptime.latest", +# ] +# ## Collect IP addresses? Valid values are "ipv4" and "ipv6" +# # ip_addresses = ["ipv6", "ipv4" ] +# +# # host_metric_exclude = [] ## Nothing excluded by default +# # host_instances = true ## true by default +# +# +# ## Clusters +# # cluster_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected) +# # cluster_exclude = [] # Inventory paths to exclude +# # cluster_metric_include = [] ## if omitted or empty, all metrics are collected +# # cluster_metric_exclude = [] ## Nothing excluded by default +# # cluster_instances = false ## false by default +# +# ## Datastores +# # datastore_include = [ "/*/datastore/**"] # Inventory path to datastores to collect (by default all are collected) +# # datastore_exclude = [] # Inventory paths to exclude +# # datastore_metric_include = [] ## if omitted or empty, all metrics are collected +# # datastore_metric_exclude = [] ## Nothing excluded by default +# # datastore_instances = false ## false by default +# +# ## Datacenters +# # datacenter_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected) +# # datacenter_exclude = [] # Inventory paths to exclude +# datacenter_metric_include = [] ## if omitted or empty, all metrics are collected +# datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default. 
+# # datacenter_instances = false ## false by default +# +# ## Plugin Settings +# ## separator character to use for measurement and field names (default: "_") +# # separator = "_" +# +# ## number of objects to retrieve per query for realtime resources (vms and hosts) +# ## set to 64 for vCenter 5.5 and 6.0 (default: 256) +# # max_query_objects = 256 +# +# ## number of metrics to retrieve per query for non-realtime resources (clusters and datastores) +# ## set to 64 for vCenter 5.5 and 6.0 (default: 256) +# # max_query_metrics = 256 +# +# ## number of go routines to use for collection and discovery of objects and metrics +# # collect_concurrency = 1 +# # discover_concurrency = 1 +# +# ## the interval before (re)discovering objects subject to metrics collection (default: 300s) +# # object_discovery_interval = "300s" +# +# ## timeout applies to any of the api request made to vcenter +# # timeout = "60s" +# +# ## When set to true, all samples are sent as integers. This makes the output +# ## data types backwards compatible with Telegraf 1.9 or lower. Normally all +# ## samples from vCenter, with the exception of percentages, are integer +# ## values, but under some conditions, some averaging takes place internally in +# ## the plugin. Setting this flag to "false" will send values as floats to +# ## preserve the full precision when averaging takes place. +# # use_int_samples = true +# +# ## Custom attributes from vCenter can be very useful for queries in order to slice the +# ## metrics along different dimension and for forming ad-hoc relationships. They are disabled +# ## by default, since they can add a considerable amount of tags to the resulting metrics. To +# ## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include +# ## to select the attributes you want to include. +# ## By default, since they can add a considerable amount of tags to the resulting metrics. 
To +# ## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include +# ## to select the attributes you want to include. +# # custom_attribute_include = [] +# # custom_attribute_exclude = ["*"] +# +# ## Optional SSL Config +# # ssl_ca = "/path/to/cafile" +# # ssl_cert = "/path/to/certfile" +# # ssl_key = "/path/to/keyfile" +# ## Use SSL but skip chain & host verification +# # insecure_skip_verify = false + + +# # A Webhooks Event collector +# [[inputs.webhooks]] +# ## Address and port to host Webhook listener on +# service_address = ":1619" +# +# [inputs.webhooks.filestack] +# path = "/filestack" +# +# [inputs.webhooks.github] +# path = "/github" +# # secret = "" +# +# [inputs.webhooks.mandrill] +# path = "/mandrill" +# +# [inputs.webhooks.rollbar] +# path = "/rollbar" +# +# [inputs.webhooks.papertrail] +# path = "/papertrail" +# +# [inputs.webhooks.particle] +# path = "/particle" + + +# # This plugin implements the Zipkin http server to gather trace and timing data needed to troubleshoot latency problems in microservice architectures. +# [[inputs.zipkin]] +# # path = "/api/v1/spans" # URL path for span data +# # port = 9411 # Port on which Telegraf listens + diff --git a/playbooks/roles/telegraf/files/telegraf_gpu.conf b/playbooks/roles/telegraf/files/telegraf_nvidia_gpu.conf similarity index 99% rename from playbooks/roles/telegraf/files/telegraf_gpu.conf rename to playbooks/roles/telegraf/files/telegraf_nvidia_gpu.conf index 2cbe5c82..3f11b523 100755 --- a/playbooks/roles/telegraf/files/telegraf_gpu.conf +++ b/playbooks/roles/telegraf/files/telegraf_nvidia_gpu.conf @@ -25,7 +25,7 @@ # Configuration for telegraf agent [agent] ## Default data collection interval for all inputs - interval = "10s" + interval = "30s" ## Rounds collection interval to 'interval' ## ie, if interval="10s" then always collect on :00, :10, :20, etc. 
round_interval = true diff --git a/playbooks/roles/telegraf/tasks/common.yml b/playbooks/roles/telegraf/tasks/common.yml index a1e079bf..05df721d 100644 --- a/playbooks/roles/telegraf/tasks/common.yml +++ b/playbooks/roles/telegraf/tasks/common.yml @@ -1,12 +1,37 @@ --- -- name: Create database - shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['controller'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" - +#- name: Create database +# shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['controller'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" +# when: ('controller' in group_names) + #- name: Create database # influxdb_database: # hostname: "{{ hostvars[groups['controller'][0]]['ansible_fqdn'] }}" # database_name: "telegraf" # run_once: true +- name: Add influxdb repository + become: true + yum_repository: + name: influxdb + description: InfluxDB Repository - RHEL $releasever + baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable + enabled: 1 + gpgcheck: 1 + gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key + when: ansible_os_family == 'RedHat' + +- name: Add InfluxData's key + become: true + apt_key: + state: present + url: https://repos.influxdata.com/influxdata-archive_compat.key + when: ansible_os_family == 'Debian' + +- name: Manage InfluxData APT repositories + become: true + apt_repository: + repo: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable + state: present + when: ansible_os_family == 'Debian' - name: Install telegraf vars: @@ -16,15 +41,38 @@ include_role: name: safe_yum +- name: force telegraf gid 998 + become: true + lineinfile: + path: /etc/group + state: present + regexp: '^telegraf:x:(.*)$' + line: 'telegraf:x:998:' + backrefs: yes + +- name: force telegraf uid 998 + become: true + lineinfile: + path: 
/etc/passwd + state: present + regexp: '^telegraf:x:(.*)$' + line: 'telegraf:x:998:998::/etc/telegraf:/bin/false' + backrefs: yes + - name: Check for nvidia-smi shell: nvidia-smi register: nvidiasmi ignore_errors: yes +- name: Check for rocm-smi + shell: rocm-smi + register: rocmsmi + ignore_errors: yes + - name: copy telegraf.conf become: true copy: - src: "{% if nvidiasmi is failed %}telegraf.conf{% else%}telegraf_gpu.conf{% endif %}" + src: "{% if nvidiasmi is failed %}{% if rocmsmi is failed %}telegraf.conf{% else%}telegraf_amd_gpu.conf{% endif %}{% else%}telegraf_nvidia_gpu.conf{% endif %}" dest: /etc/telegraf/telegraf.conf force: yes backup: yes @@ -44,7 +92,7 @@ mode: 0744 with_items: - infiniband.conf - - influxdb.conf + - prometheus.conf - net.conf - ethtool_counters.conf - infiniband_mlx5_0_hw_counters.conf @@ -64,6 +112,35 @@ - infiniband_mlx5_14_hw_counters.conf - infiniband_mlx5_15_hw_counters.conf - infiniband_mlx5_16_hw_counters.conf + +- name: render conf files + become: true + template: + src: "{{ item }}.j2" + dest: /etc/telegraf/telegraf.d/{{item}} + force: yes + backup: yes + owner: telegraf + group: telegraf + mode: 0744 + with_items: + - nvidia_gpu.conf + when: not nvidiasmi is failed + +- name: render conf files + become: true + template: + src: "{{ item }}.j2" + dest: /etc/telegraf/telegraf.d/{{item}} + force: yes + backup: yes + owner: telegraf + group: telegraf + mode: 0744 + with_items: + - amd_gpu.conf + when: not rocmsmi is failed + - name: restart telegraf become: true service: diff --git a/playbooks/roles/telegraf/templates/amd_gpu.conf.j2 b/playbooks/roles/telegraf/templates/amd_gpu.conf.j2 new file mode 100644 index 00000000..e69de29b diff --git a/playbooks/roles/telegraf/templates/custom.cnf.j2 b/playbooks/roles/telegraf/templates/custom.cnf.j2 new file mode 100644 index 00000000..b89e6bfb --- /dev/null +++ b/playbooks/roles/telegraf/templates/custom.cnf.j2 @@ -0,0 +1,3 @@ +[[inputs.exec]] +commands = ["echo {}"] +data_format = 
"json" \ No newline at end of file diff --git a/playbooks/roles/telegraf/templates/nvidia_gpu.conf.j2 b/playbooks/roles/telegraf/templates/nvidia_gpu.conf.j2 new file mode 100644 index 00000000..d044edf8 --- /dev/null +++ b/playbooks/roles/telegraf/templates/nvidia_gpu.conf.j2 @@ -0,0 +1 @@ +[[inputs.nvidia_smi]] \ No newline at end of file diff --git a/playbooks/roles/telegraf/templates/prometheus.conf.j2 b/playbooks/roles/telegraf/templates/prometheus.conf.j2 new file mode 100755 index 00000000..e7cab267 --- /dev/null +++ b/playbooks/roles/telegraf/templates/prometheus.conf.j2 @@ -0,0 +1,3 @@ +[[outputs.prometheus_client]] + listen = ":9273" + expiration_interval = "60s" \ No newline at end of file diff --git a/playbooks/site.yml b/playbooks/site.yml index 72cfcca0..a76a6afd 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -1,4 +1,3 @@ - - hosts: compute become: true vars: @@ -9,10 +8,10 @@ tasks: - include_role: name: hostname - when: slurm | default(false) | bool + when: change_hostname | default(false) | bool # for ubuntu, on all compute nodes, run --fix-broken install -- hosts: compute, login +- hosts: compute, login, monitoring become: true tasks: - include_role: @@ -51,21 +50,15 @@ - include_role: name: localdisk when: localdisk | default(true) | bool + - include_role: + name: healthchecks - hosts: compute become: true gather_facts: true tasks: - - include_role: - name: oci-cn-auth - when: cluster_network|bool and not use_compute_agent|default(false)|bool - - include_role: - name: rdma-interface - when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem - - include_role: - name: healthchecks - hosts: controller become: true @@ -97,7 +90,7 @@ name: fss-home when: add_nfs|bool and home_fss|bool -- hosts: controller, slurm_backup, login +- hosts: controller, slurm_backup, login, monitoring become: true tasks: - include_role: @@ -127,18 +120,12 @@ - include_role: name: cluster-cli when: 
ldap|default(true)|bool - -# configure if instance_principal is False -- hosts: controller - become: true - tasks: - include_role: name: no_instance_principal when: not inst_prin|bool - -- hosts: compute, login +- hosts: compute, login, monitoring become: true tasks: - include_role: @@ -267,27 +254,38 @@ name: nccl-conf when: cluster_network|bool -- hosts: all +- hosts: all,!monitoring tasks: - include_role: - name: influxdb - when: monitoring|default(false)|bool + name: metrics-exporter + when: cluster_monitoring|default(false)|bool + +- hosts: monitoring + tasks: - include_role: - name: telegraf - when: monitoring|default(false)|bool + name: grafana + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length > 0 ) - hosts: controller tasks: - include_role: name: grafana - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length == 0 ) + +- hosts: controller, monitoring + tasks: + - include_role: + name: prometheus + when: cluster_monitoring|default(false)|bool + +- hosts: controller + tasks: - include_role: name: autoscaling_mon when: autoscaling_monitoring|default(false)|bool - include_role: name: cron - - hosts: compute become: true vars: diff --git a/playbooks/slurm_config.yml b/playbooks/slurm_config.yml index dce70f01..ce9c15f0 100755 --- a/playbooks/slurm_config.yml +++ b/playbooks/slurm_config.yml @@ -1,4 +1,4 @@ -- hosts: controller,slurm_backup,compute,login +- hosts: controller,slurm_backup,compute,login, monitoring gather_facts: true vars: destroy: false diff --git a/queues.conf b/queues.conf index 351846b2..0e872200 100644 --- a/queues.conf +++ b/queues.conf @@ -6,7 +6,8 @@ - name: hpc-default default: true shape: ${shape} - instance_keyword: hpc + change_hostname: ${change_hostname} + hostname_convention: ${hostname_convention} # Will add -INDEX with the index of the IP in the subnet permanent: false cluster_network: ${cluster_network} compute_cluster: 
${compute_cluster} @@ -27,10 +28,11 @@ instance_pool_custom_memory: ${instance_pool_custom_memory} marketplace_listing: ${marketplace_listing} hyperthreading: ${hyperthreading} - - name: permanent + - name: ${cluster_name} default: false shape: ${shape} - instance_keyword: permanent + change_hostname: ${change_hostname} + hostname_convention: ${hostname_convention} # Will add -INDEX with the index of the IP in the subnet permanent: true cluster_network: ${cluster_network} compute_cluster: ${compute_cluster} diff --git a/samples/gpu/rccl_run_allreduce.sbatch b/samples/gpu/rccl_run_allreduce.sbatch new file mode 100644 index 00000000..b8a341e4 --- /dev/null +++ b/samples/gpu/rccl_run_allreduce.sbatch @@ -0,0 +1,48 @@ +#!/bin/bash +#SBATCH --job-name=nccl-allreduce-slurm +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --exclusive +export PMI_DEBUG=1 + + +cd /nfs/cluster +mkdir $SLURM_JOB_ID +cd $SLURM_JOB_ID + +MACHINEFILE="hostfile" + +scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE +echo MACHINEFILE +cat $MACHINEFILE + +source /etc/os-release + +mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + +if [[ "$mpivars_path" == "" ]]; then + mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` +fi + +if [[ "$mpivars_path" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi + +source $mpivars_path + +var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_7,mlx5_8,mlx5_9" + + + mpirun --mca pml ucx \ + --bind-to numa \ +-x UCX_NET_DEVICES=mlx5_0:1 \ +-x NCCL_SOCKET_IFNAME=eth0 \ +-x NCCL_IB_SL=0 \ +-x NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_7,mlx5_8,mlx5_9" \ +-x coll_hcoll_enable=0 \ +-x HCOLL_ENABLE_MCAST_ALL=0 \ +-x NCCL_IGNORE_CPU_AFFINITY=1 \ +-x NCCL_IB_QPS_PER_CONNECTION=4 \ +-x RX_QUEUE_LEN=8192 \ +-x IB_RX_QUEUE_LEN=8192 \ +-np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/rccl-tests/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 \ No newline at end of file diff --git 
a/schema.yaml b/schema.yaml index 4bc43ba9..fedf118d 100755 --- a/schema.yaml +++ b/schema.yaml @@ -56,7 +56,6 @@ variableGroups: - ${node_count} - ${hyperthreading} - ${boot_volume_size} - - ${use_compute_agent} - ${use_marketplace_image} - ${compute_username} - ${marketplace_listing} @@ -71,6 +70,8 @@ variableGroups: - ${access_ctrl} - ${numa_nodes_per_socket} - ${percentage_of_cores_enabled} + - ${change_hostname} + - ${hostname_convention} - title: "Additional Login Node" variables: @@ -92,6 +93,26 @@ variableGroups: - ${login_block} - ${login_block_volume_size} - ${login_block_volume_performance} + + - title: "Cluster Monitoring" + variables: + - ${cluster_monitoring} + - ${monitoring_node} + - ${monitoring_ad} + - ${monitoring_shape} + - ${monitoring_ocpus} + - ${monitoring_ocpus_denseIO_flex} + - ${monitoring_custom_memory} + - ${monitoring_memory} + - ${monitoring_boot_volume_size} + - ${use_marketplace_image_monitoring} + - ${marketplace_listing_monitoring} + - ${unsupported_monitoring} + - ${monitoring_image_compartment} + - ${custom_monitoring_image} + - ${unsupported_monitoring_image} + - ${monitoring_username} + - title: Autoscaling variables: - ${autoscaling} @@ -103,7 +124,7 @@ variableGroups: - ${api_user_ocid} - ${api_fingerprint} - ${api_user_key} - - title: "Monitoring" + - title: "Autoscaling Monitoring" variables: - ${autoscaling_mysql_service} - ${monitoring_shape_name} @@ -117,6 +138,7 @@ variableGroups: variables: - ${add_nfs} - ${create_fss} + - ${mount_target_count} - ${nfs_target_path} - ${nfs_source_IP} - ${nfs_source_path} @@ -176,7 +198,6 @@ variableGroups: - ${rack_aware} - ${queue} - ${spack} - - ${monitoring} - ${enroot} - ${pyxis} - ${pam} @@ -201,6 +222,9 @@ variableGroups: - ${controller_boot_volume_backup_retention_seconds} - ${controller_boot_volume_backup_time_zone} - ${controller_boot_volume_backup_type} + - ${use_compute_agent} + - ${nfs_list_of_mount_target_IPs} + visible: false - title: "Debug" variables: @@ -416,12 
+440,10 @@ variables: description: "Marketplace listing to use" required: true enum: - - "HPC_OL7" - "HPC_OL8" + - "GPU_OL8_NV560" - "GPU_OL8_NV550" - - "GPU_OL7_NV550" - "GPU_OL8_NV535" - - "GPU_OL7_NV535" default: "HPC_OL8" visible: ${use_marketplace_image_controller} @@ -435,7 +457,7 @@ variables: unsupported_controller_image: title: "Image OCID" - description: "Custom image ID for compute nodes. Please note that only Oracle Linux 7 and Ubuntu 20.04 are supported as controller image at this moment." + description: "Custom image ID for compute nodes. Please note that only Oracle Linux 7 and Ubuntu 22.04 are supported as controller image at this moment." type: string required: true visible: ${unsupported_controller} @@ -455,7 +477,7 @@ variables: custom_controller_image: title: "controller Image ID" - description: "Custom image ID for controller nodes. Please note that only Oracle Linux 7, 8 and Ubuntu 20.04 are supported as controller image at this moment. " + description: "Custom image ID for controller nodes. Please note that only Oracle Linux 7, 8 and Ubuntu 22.04 are supported as controller image at this moment. " type: oci:core:image:id dependsOn: compartmentId: ${controller_image_compartment} @@ -472,7 +494,7 @@ variables: required: true minimum: 50 title: "Size of the boot volume in GB" - default: 100 + default: 1024 controller_boot_volume_backup: type: boolean @@ -505,6 +527,7 @@ variables: - "0. Lower performance" - "10. Balanced performance" - "20. High Performance" + - "30. Ultra High Performance" default: "10. 
Balanced performance" visible: and: @@ -590,9 +613,11 @@ variables: - "BM.GPU.B4.8" - "BM.GPU.A100-v2.8" - "BM.GPU.H100.8" + - "BM.GPU.MI300X.8" + - "BM.GPU.L40S.4" - "BM.Optimized3.36" - "BM.HPC.E5.144" - default: "BM.HPC2.36" + default: "BM.GPU.H100.8" title: "Shape of the Compute Nodes" description: "Shape of compute nodes used in permanent/initial cluster" required: true @@ -745,7 +770,7 @@ variables: required: true minimum: 50 title: "Size of the boot volume in GB" - default: 100 + default: 256 description: "Boot volume size in GB of each compute node" use_marketplace_image: @@ -760,24 +785,13 @@ variables: description: "Marketplace listing to use" required: true enum: - - "HPC_OL7" - "HPC_OL8" + - "GPU_OL8_NV560" - "GPU_OL8_NV550" - - "GPU_OL7_NV550" - "GPU_OL8_NV535" - - "GPU_OL7_NV535" default: "GPU_OL8_NV550" visible: ${use_marketplace_image} - use_compute_agent: - type: boolean - title: "use compute agent" - description: "Select if your image has the OCA agent rather than the oci-cn-auth package. The new marketplace images need the compute agent enabled." - default: true - visible: - not: - - ${use_marketplace_image} - compute_image_compartment: title: "compute image compartment" type: oci:identity:compartment:id @@ -791,7 +805,7 @@ variables: image: title: "Image" - description: "Custom image ID for compute nodes. Supported OS are OL7, OL8, CentOS7 and Ubuntu 20.04" + description: "Custom image ID for compute nodes. Supported OS are OL8, CentOS7 and Ubuntu 22.04 (OL7 should still work but is not supported anymore)" type: oci:core:image:id required: true dependsOn: @@ -825,7 +839,7 @@ variables: image_ocid: title: "Image OCID" - description: "Custom image ID for compute nodes. Supported OS are OL7, OL8, CentOS7 and Ubuntu 20.04" + description: "Custom image ID for compute nodes. 
Supported OS are OL8, CentOS7 and Ubuntu 22.04 (OL7 should still work but is not supported anymore)" type: string required: true visible: @@ -837,8 +851,7 @@ variables: BIOS: title: "Modify BIOS options" - description: "WARNING : Do NOT change those if you have not tested the changes on a single instance. Error will be \"Shape does not support the provided platform -configuration\" " + description: "WARNING : Do NOT change those if you have not tested the changes on a single instance. Error will be \"Shape does not support the provided platform configuration\" " type: boolean default: false visible: true @@ -895,6 +908,20 @@ configuration\" " default: "Default" visible: ${BIOS} + change_hostname: + title: "Change hostname" + description: "Will modify the hostname of the node but not in the oci console" + type: boolean + default: true + required: true + + hostname_convention: + type: string + title: "Hostname Convention" + description: "Will add -INDEX at the end with the index of the IP in the subnet" + default: "GPU" + visible: ${change_hostname} + use_advanced: type: boolean title: "Show advanced storage options" @@ -1038,12 +1065,6 @@ configuration\" " compartmentId: ${vcn_compartment} vcnId: ${vcn_id} hidePrivateSubnet: false - # visible: - # and: - # - not: - # - ${private_deployment} - # - and: - # - ${use_existing_vcn} visible: ${use_existing_vcn} required: true private_subnet_id: @@ -1117,9 +1138,6 @@ configuration\" " default: "0.0.0.0/0" description: "Allowed SSH network in CIDR notation" required: true -# visible: -# not: -# - ${use_existing_vcn} slurm: type: boolean title: "Install SLURM" @@ -1184,6 +1202,12 @@ configuration\" " description: "Install Enroot, Nvidia Container Toolkit, and docker." visible: ${slurm} + cluster_monitoring: + type: boolean + title: "Install HPC Cluster Monitoring Tools" + default: false + description: "Install Grafana, Node-Exporter, and Prometheus tools for system monitoring." 
+ pam: type: boolean title: "Enable PAM" @@ -1204,12 +1228,6 @@ configuration\" " default: false description: "Will run tests on GPU nodes before starting a job. Nodes that are showing issues will be set in drain state" visible: ${slurm} - - monitoring: - type: boolean - title: "Install HPC Cluster Monitoring Tools" - default: false - description: "Install Grafana, Telegrapf, and InfluxDB tools for system monitoring." autoscaling: type: boolean @@ -1275,7 +1293,18 @@ configuration\" " default: false description: "For FSS, leave options filed empty." visible: ${add_nfs} - + + mount_target_count: + title: "Number of FSS mount targets" + description: "Number of FSS Mount Target to use with File System" + type: integer + default: 1 + required: true + visible: + and: + - ${add_nfs} + - ${create_fss} + fss_compartment: title: "FSS compartment" description: "Compartment to add the FSS Mount Target and File System" @@ -1574,7 +1603,7 @@ configuration\" " required: true minimum: 50 title: "Size of the boot volume in GB" - default: 250 + default: 512 visible: ${login_node} login_block: @@ -1634,7 +1663,7 @@ configuration\" " custom_login_image: title: "Login Image ID" - description: "Custom image ID for login nodes. Please note that only Oracle Linux and Ubuntu 20.04 are supported as login image at this moment. " + description: "Custom image ID for login nodes. Please note that only Oracle Linux and Ubuntu 22.04 are supported as login image at this moment. 
" type: oci:core:image:id dependsOn: compartmentId: ${login_image_compartment} @@ -1679,14 +1708,242 @@ configuration\" " description: "Marketplace listing to use" required: true enum: - - "HPC_OL7" - "HPC_OL8" + - "GPU_OL8_NV560" - "GPU_OL8_NV550" - - "GPU_OL7_NV550" - "GPU_OL8_NV535" - - "GPU_OL7_NV535" default: "HPC_OL8" visible: and: - ${use_marketplace_image_login} - ${login_node} + + + + monitoring_node: + type: boolean + title: "Monitoring Node" + default: false + description: "Create an additional monitoring node" + visible: cluster_monitoring + + monitoring_ad: + type: oci:identity:availabilitydomain:name + dependsOn: + compartmentId: ${targetCompartment} + visible: + and: + - complexExpression + - ${monitoring_node} + required: true + description: "Availability Domain for monitoring node" + title: "Availability Domain For monitoring Node" + default: ${ad} + + monitoring_shape: + type: oci:core:instanceshape:name + dependsOn: + compartmentId: ${targetCompartment} + required: true + default: VM.Standard.E4.Flex + visible: ${monitoring_node} + + monitoring_ocpus: + type: integer + description: Number of OCPU's for flex shape + minimum: 1 + maximum: 64 + default: 32 + visible: + and: + - or: + - eq: + - ${monitoring_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E5.Flex" + - eq: + - ${monitoring_shape} + - "VM.Optimized3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard3.Flex" + - ${monitoring_node} + required: true + + monitoring_ocpus_denseIO_flex: + title: Cores + type: enum + description: Number of OCPU's for Dense IO flex shape + enum: + - 8 + - 16 + - 32 + default: 16 + visible: + and: + - or: + - eq: + - ${monitoring_shape} + - "VM.DenseIO.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.DenseIO.E5.Flex" + - ${monitoring_node} + required: true + + monitoring_custom_memory: + title: Use custom memory 
size + type: boolean + default: false + visible: + and: + - or: + - eq: + - ${monitoring_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Optimized3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E5.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard3.Flex" + - ${monitoring_node} + monitoring_memory: + title: Memory in GBS + type: integer + description: Number of memory for flex shape. Minimum 1GB per core. + minimum: 1 + maximum: 1024 + default: 256 + visible: + and: + - and: + - or: + - eq: + - ${monitoring_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Optimized3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E5.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard3.Flex" + - and: + - ${monitoring_custom_memory} + - ${monitoring_node} + required: true + + monitoring_boot_volume_size: + type: integer + required: true + minimum: 50 + title: "Size of the boot volume in GB" + default: 512 + visible: ${monitoring_node} + + unsupported_monitoring: + title: "Use unsupported image" + description: "Custom image ID for monitoring Node" + type: boolean + default: false + visible: + and: + - ${monitoring_node} + - not: + - ${use_marketplace_image_monitoring} + + monitoring_image_compartment: + title: "monitoring image compartment" + type: oci:identity:compartment:id + default: ${targetCompartment} + visible: + and: + - ${monitoring_node} + - not: + - ${unsupported_monitoring} + - not: + - ${use_marketplace_image_monitoring} + required: true + + custom_monitoring_image: + title: "monitoring Image ID" + description: "Custom image ID for monitoring nodes. Please note that only Oracle Linux and Ubuntu 22.04 are supported as monitoring image at this moment. 
" + type: oci:core:image:id + dependsOn: + compartmentId: ${monitoring_image_compartment} + visible: + and: + - ${monitoring_node} + - not: + - ${unsupported_monitoring} + - not: + - ${use_marketplace_image_monitoring} + required: true + unsupported_monitoring_image: + title: "Image OCID" + description: "Custom image ID for monitoring nodes" + type: string + required: true + visible: + and: + - ${unsupported_monitoring} + - not: + - ${use_marketplace_image_monitoring} + default: "image.ocid" + + monitoring_username: + title: "Default username for monitoring node" + description: "Custom image ID for monitoring node" + type: string + default: "opc" + required: true + visible: ${monitoring_node} + + use_marketplace_image_monitoring: + type: boolean + title: "use marketplace image" + description: "Use marketplace image, otherwise provide custom image OCID" + default: true + visible: ${monitoring_node} + + marketplace_listing_monitoring: + type: enum + title: "Image version" + description: "Marketplace listing to use" + required: true + enum: + - "HPC_OL8" + - "GPU_OL8_NV560" + - "GPU_OL8_NV550" + - "GPU_OL8_NV535" + default: "HPC_OL8" + visible: + and: + - ${use_marketplace_image_monitoring} + - ${monitoring_node} diff --git a/scripts/ib_write_bw.sh b/scripts/ib_write_bw.sh index 951afe84..14b65360 100644 --- a/scripts/ib_write_bw.sh +++ b/scripts/ib_write_bw.sh @@ -1,263 +1,35 @@ #!/bin/bash -#ib_write_bw.sh -#This script can be used to check ib_write_bw between two gpu nodes in the cluster. -#Currently supported shapes are BM.GPU.B4.8,BM.GPU.A100-v2.8,BM.GPU4.8 -#If cuda is installed on the node, script execution will recompile perftest with cuda. -dis_help() -{ - echo - echo "Usage:" - echo - echo "./ib_write_bw.sh -s -n -c -g " - echo - echo "Options:" - echo "s Server hostname" - echo "n Client hostname." - echo "c Enable cuda (Default: Disabled)" - echo "g GPU id (Default: 0)" - echo "h Print this help." 
- echo - echo "Logs are stored at /tmp/logs" - echo - echo "e.g., sh ./ib_write_bw.sh -s compute-permanent-node-1 -n compute-permanent-node-2 -c y -g 2" - echo - echo "Supported shapes: BM.GPU.B4.8,BM.GPU.A100-v2.8,BM.GPU4.8" - echo -} - -#Exit if no arguments passed -if [ "$#" -eq 0 ] -then - dis_help - exit 1 -fi - -#Display options -gid=0 -cuda=n -while getopts "s:n:c:g:h" option -do - case $option in - s) server=${OPTARG};; - n) client=${OPTARG};; - c) cuda=${OPTARG};; - g) gid=${OPTARG};; - h) dis_help - exit;; - \?) # Invalid option - echo "Error: Invalid option" - exit;; - esac -done - - -#Check node shape -shape=`ssh $server 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape'` -if [ "$shape" == \"BM.GPU.B4.8\" ] || [ "$shape" == \"BM.GPU.A100-v2.8\" ] || [ "$shape" == \"BM.GPU4.8\" ]; -then -echo -echo "Shape: $shape" -echo "Server: $server" -echo "Client: $client" -echo "Cuda: $cuda" -echo "GPU id: $gid" +# run ib_write_bw between two nodes +# If on bastion: ./ibbw.sh +# If on one compute node: ./ibbw.sh + +Server=$1 +Client=${2:-localhost} +Dev=${3:-mlx5_17} +shape=`ssh $Server 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq -r .shape'` +if [ "$shape" == "BM.GPU4.8" ]; then + HCA="mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9 mlx5_10 mlx5_11 mlx5_12 mlx5_13 mlx5_14 mlx5_15 mlx5_16 mlx5_17" +elif [ "$shape" == "BM.GPU.A100-v2.8" ]; then + HCA="mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_5 mlx5_6 mlx5_7 mlx5_8 mlx5_9 mlx5_10 mlx5_11 mlx5_12 mlx5_14 mlx5_15 mlx5_16 mlx5_17" +elif [ "$shape" == "BM.GPU.H100.8" ]; then + HCA="mlx5_0 mlx5_1 mlx5_3 mlx5_4 mlx5_5 mlx5_6 mlx5_7 mlx5_8 mlx5_9 mlx5_10 mlx5_12 mlx5_13 mlx5_14 mlx5_15 mlx5_16 mlx5_17" +elif [ "$shape" == "BM.GPU.B4.8" ]; then + exit "Not tested yet" + HCA="mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_5 mlx5_6 mlx5_7 mlx5_8 mlx5_9 mlx5_10 mlx5_11 mlx5_12 mlx5_14 mlx5_15 mlx5_16 mlx5_17" else - echo - echo "Shape $shape is not 
supported by this script" - dis_help -exit -fi - -#Set variables -cuda_path=`ssh $server /usr/sbin/alternatives --list|grep cuda | awk -F" " '{print $3}'|tail -1`/targets/x86_64-linux/include/cuda.h -server_ip=`grep $server /etc/hosts |grep -v rdma|awk '{print $1}'` -logdir=/tmp/logs/ib_bw/`date +%F-%H` -outdir=/tmp/ib_bw/ -gpu_count=`ssh $server nvidia-smi -L |wc -l` - -#check cuda installation -ssh -q $server [[ -f $cuda_path ]] && echo " " || echo "Please check cuda installation; exit 1"; - -#Set interface to be skipped based on node shape -if [ "$shape" == \"BM.GPU.B4.8\" ] || [ "$shape" == \"BM.GPU.A100-v2.8\" ] -then -skip_if=mlx5_0 - elif [ "$shape" == \"BM.GPU4.8\" ] - then - skip_if=mlx5_4 -fi - -#Validate GPU ID -if [ "$gid" -gt "$gpu_count" ] -then -echo -echo "GPU id value should be less than or equal to the total number of GPUs installed. That is $gpu_count" -exit 1 -fi - -#Check active interfaces -echo -printf "Checking interfaces...\n" -srv_if_count=`ssh $server ibv_devinfo |egrep "hca_id|state"|tac|sed '/PORT_DOWN/I,+1d'|tac|sed -e '/PORT_ACTIVE/d'|awk -F: '{print $2}'|sed 's/[[:space:]]//g'|sort -t _ -k2.2|grep -v $skip_if|wc -l` -client_if_count=`ssh $client ibv_devinfo |egrep "hca_id|state"|tac|sed '/PORT_DOWN/I,+1d'|tac|sed -e '/PORT_ACTIVE/d'|awk -F: '{print $2}'|sed 's/[[:space:]]//g'|sort -t _ -k2.2|grep -v $skip_if|wc -l` - -if [ "$srv_if_count" != "$client_if_count" ] -then - echo - echo "Active interfaces are different on both nodes. 
Please fix it before running this script" - echo "Interface count on server: $srv_if_count" - echo "Interface count on client: $client_if_count" - exit 1 -fi - -#Generate ansible playbook -if [ "$cuda" == "y" ] || [ "$cuda" == "yes" ]; -then -cat > /tmp/ib_bw_gpu.yml << EOF ---- -- hosts: all - become: true - tasks: - - name: check cuda - stat: - path: $cuda_path - register: cuda_data - - - block: - - name: yum remove perftest - yum: - name: perftest - state: absent - - - name: Git checkout perftest - ansible.builtin.git: - repo: 'https://github.com/linux-rdma/perftest.git' - dest: /tmp/perftest - - - name: Run autogen.sh - ansible.builtin.shell: /tmp/perftest/autogen.sh - args: - chdir: /tmp/perftest - - - name: Run configure - ansible.builtin.shell: ./configure CUDA_H_PATH=$cuda_path - args: - chdir: /tmp/perftest - - - name: Build 'all' target with extra arguments - make: - chdir: /tmp/perftest - target: all - - - name: Copy files - shell: cp /tmp/perftest/ib_* /usr/bin - when: - - use_cuda is defined - - use_cuda == "yes" or use_cuda == "y" - - cuda_data.stat.exists -EOF - -#Create ansible inventory -cat > /tmp/inventory << EOF -$server -$client -EOF -ansible-playbook /tmp/ib_bw_gpu.yml -i /tmp/inventory -e "use_cuda=$cuda" -fi - -#Set interface to be skipped based on node shape -shape=`ssh $server 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape'` -if [ "$shape" == \"BM.GPU.B4.8\" ] || [ "$shape" == \"BM.GPU.A100-v2.8\" ] -then -skip_if=mlx5_0 - elif [ "$shape" == \"BM.GPU4.8\" ] - then - skip_if=mlx5_4 -fi - -#Check active interfaces -printf "Testing active interfaces...\n" -echo -ssh $server ibv_devinfo |egrep "hca_id|state"|tac|sed '/PORT_DOWN/I,+1d'|tac|sed -e '/PORT_ACTIVE/d'|awk -F: '{print $2}'|sed 's/[[:space:]]//g'|sort -t _ -k2.2|grep -v $skip_if - -#Generate server script -cat > /tmp/ib_server.sh << 'EOF' -#! 
/bin/bash - -out_dir=/tmp/ib_bw -mkdir -p $out_dir -shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` -if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] -then -skip_if=mlx5_0 - elif [ $shape == \"BM.GPU4.8\" ] - then - skip_if=mlx5_4 -fi -for interface in `ibv_devinfo |egrep "hca_id|state"|tac|sed '/PORT_DOWN/I,+1d'|tac|sed -e '/PORT_ACTIVE/d'|awk -F: '{print $2}'|sed 's/[[:space:]]//g'|sort -t _ -k2.2|grep -v $skip_if` -do -echo -echo "Server Interface: $interface" -echo -ib_write_bw -d $interface -a -F &> $out_dir/ib_server-$interface -sleep 10 -done -EOF - -#Generate client script -cat > /tmp/ib_client.sh << 'EOF' -#! /bin/bash - -out_dir=/tmp/ib_bw -mkdir -p $out_dir -#interfaces -shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` -if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] -then -skip_if=mlx5_0 - elif [ $shape == \"BM.GPU4.8\" ] - then - skip_if=mlx5_4 -fi -for interface in `ibv_devinfo |egrep "hca_id|state"|tac|sed '/PORT_DOWN/I,+1d'|tac|sed -e '/PORT_ACTIVE/d'|awk -F: '{print $2}'|sed 's/[[:space:]]//g'|sort -t _ -k2.2|grep -v $skip_if` -do -ib_write_bw -d $interface -F $server_ip -D 10 --cpu_util --report_gbits &> $out_dir/ib_client-$interface -cat $out_dir/ib_client-$interface -sleep 15 -done -EOF - -#Update server ip in ib_client.sh -sed -i "/#interfaces/a server_ip=$server_ip" /tmp/ib_client.sh -chmod +x /tmp/ib_server.sh /tmp/ib_client.sh - -#Update scripts to use cuda if selected -if [ "$cuda" == "yes" ] || [ "$cuda" == "y" ]; -then - sed -i 's/ib_write_bw.*/ib_write_bw -d $interface --use_cuda=0 -F > $out_dir\/ib_server-$interface/g' /tmp/ib_server.sh - sed -i 's/ib_write_bw.*/ib_write_bw -d $interface --use_cuda=0 -D 10 -I 0 $server_ip --cpu_util --report_gbits/g' /tmp/ib_client.sh - sed -i -e "s/--use_cuda=0/--use_cuda=${gid:=0}/g" /tmp/ib_server.sh - sed -i -e 
"s/--use_cuda=0/--use_cuda=${gid:=0}/g" /tmp/ib_client.sh -fi -echo - -#Copy and run scripts -scp /tmp/ib_server.sh $server:/tmp -scp /tmp/ib_client.sh $client:/tmp -ssh $server "/tmp/ib_server.sh" & -ssh $client "/tmp/ib_client.sh" - -#Sync results to controller -mkdir -p $logdir -rsync -a opc@$client:$outdir $logdir - -#Generate test summary -echo -echo "************** Test Summary **************" -for i in `ls -ltr $logdir | awk -F" " '{print $9}'|awk -F- '{print $2}'`; do -echo -echo Server interface: $i | tee -a /tmp/ib_write_bw_log.txt -echo -grep -A2 MsgRate $logdir/ib_client-$i | tee -a /tmp/ib_write_bw_log.txt -done + echo "Shape $shape not supported" +fi +cmd_base="/usr/bin/ib_write_bw -a -F -q 2 --report_gbits" +cmd_base="/usr/bin/ib_write_bw -b -F -q 2 -x 3 --report_gbits" +cmd_base="/usr/bin/ib_write_bw -b -F -q 2 -x 3 --report_gbits" +for Dev in $HCA; do + echo -e "$Server $Client $Dev \c" + ssh $Server exec $cmd_base -d $Dev > /dev/null 2>&1 & + # make sure the server start listening before client make requests + sleep 1 + BW=`ssh $Client $cmd_base -d $Dev $Server | grep "65536 10000" | awk '{print $3}'` + #BW=`ssh $Client $cmd_base -d $Dev $Server ` + echo "$BW" + #wait +done \ No newline at end of file diff --git a/scripts/mlx_firmware_update.sh b/scripts/mlx_firmware_update.sh deleted file mode 100644 index 3c8fb2eb..00000000 --- a/scripts/mlx_firmware_update.sh +++ /dev/null @@ -1,164 +0,0 @@ -#!/bin/bash -# mlx_firmware_update.sh - -# This script updates the roce_tx_window_en setting and oci-cn-auth package. -# It needs mandatory one argument which is a hostfile (one host per line). -# After updating, it also returns the roce_tx_window_en setting and oci-cn-auth version. -# If you specify the optional 2nd argument "check", then it will not update but only return the current roce_tx_window_en setting -# and oci-cn-auth version. 
-# Example: -# ./mlx_firmware_update.sh hosts -# ./mlx_firmware_update.sh hosts check - -MODE=update - -# check if host file is passed -if [ -n "$1" ]; then - HOST_FILE=$1 -else - echo "scriptname " - echo "host file is missing, pass a file with list of hostname, one host per line" - exit 1; -fi - -# optional parameter to check the changes -if [ -n "$2" ]; then - if [ "$2" == "check" ]; then - MODE="check" - fi -fi - - - -# check if ubuntu or oracle -source /etc/os-release - -if [ $ID == "ol" ] ; then - echo "oracle" - USERNAME=opc -fi - -if [ $ID == "ubuntu" ] ; then - echo "ubuntu" - USERNAME=ubuntu -fi - - -function check_roce_tx_window_en { - cat > ./check_roce_tx_window_en.sh << EOF -#!/bin/bash -# check roce_tx_window_en setting -# -# -mlxreg=\$(which mlxreg) -shape=\$(curl -q -s 169.254.169.254/opc/v1/instance/shape) -for pci_id in \$(cat /opt/oci-hpc/oci-cn-auth/configs/shapes.json | jq '.["hpc-shapes"]' | jq ".[] | select(.shape==\"\$shape\") " | jq -r '.["rdma-nics"] | .[].pci') ; do -echo \$pci_id ; \$mlxreg --yes -d \$pci_id --reg_name ROCE_ACCL --get | grep roce_tx_window_en -done - -EOF - -chmod +x ./check_roce_tx_window_en.sh - -for h in `less $HOST_FILE` ; - do - echo $h - scp ./check_roce_tx_window_en.sh $USERNAME@$h:/tmp/ - done - - -for h in `less $HOST_FILE` ; - do - echo $h - ssh $USERNAME@$h "sudo /tmp/check_roce_tx_window_en.sh" - done - } - -function check_oci_cn_auth_version { - for h in `less $HOST_FILE` ; - do - echo $h - ssh $USERNAME@$h "cat /opt/oci-hpc/oci-cn-auth/.version-oci_cn_auth" - done -} - -if [ $MODE == "check" ] ; then - check_roce_tx_window_en - check_oci_cn_auth_version - -else - - -# generate ./update_roce_tx_window_en.sh file -cat > ./update_roce_tx_window_en.sh << EOF -#!/bin/bash -# Script to set roce_tx_window_en=0 -# -# -mlxreg=\$(which mlxreg) -shape=\$(curl -q -s 169.254.169.254/opc/v1/instance/shape) -for pci_id in \$(cat /opt/oci-hpc/oci-cn-auth/configs/shapes.json | jq '.["hpc-shapes"]' | jq ".[] | 
select(.shape==\"\$shape\") " | jq -r '.["rdma-nics"] | .[].pci') ; do -echo \$pci_id ; \$mlxreg --yes -d \$pci_id --reg_name ROCE_ACCL --set roce_tx_window_en=0 -done - -EOF - -chmod +x ./update_roce_tx_window_en.sh - -# generate install file -cat > ./install_oci-cn-auth-package.sh << EOF -#!/bin/bash - -#DEBIAN_FRONTEND=noninteractive - -# check if ubuntu or oracle -source /etc/os-release - -# download file -UBUNTU_PACKAGE_URL="https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/F7gihhVuJbrnsV8KjAMA7XblkZYRBYJ2xAH2FPmaIJrgtYcuy5wJRWAQXMfw9hLD/n/hpc/b/source/o/oci-cn-auth_2.1.4-compute_all.deb" -UBUNTU_PACKAGE="/tmp/oci-cn-auth_2.1.4-compute_all.deb" -ORACLE_PACKAGE_URL="https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/F7gihhVuJbrnsV8KjAMA7XblkZYRBYJ2xAH2FPmaIJrgtYcuy5wJRWAQXMfw9hLD/n/hpc/b/source/o/oci-cn-auth-2.1.4-compute.el7.noarch.rpm" -ORACLE_PACKAGE="/tmp/oci-cn-auth-2.1.4-compute.el7.noarch.rpm" - - -if [ \$ID == "ol" ] ; then - echo "oracle" - USERNAME=opc - wget -O \$ORACLE_PACKAGE \$ORACLE_PACKAGE_URL - sudo yum localinstall -y -q \$ORACLE_PACKAGE -fi - -if [ \$ID == "ubuntu" ] ; then - echo "ubuntu" - USERNAME=ubuntu - wget -O \$UBUNTU_PACKAGE \$UBUNTU_PACKAGE_URL - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -q \$UBUNTU_PACKAGE -fi - - -EOF - -chmod +x ./install_oci-cn-auth-package.sh - - -# Run for loop to copy file to all nodes and execute them -for h in `less $HOST_FILE` ; - do - echo $h - scp ./install_oci-cn-auth-package.sh $USERNAME@$h:/tmp/ - scp ./update_roce_tx_window_en.sh $USERNAME@$h:/tmp/ - done - -#exit 0 - -for h in `less $HOST_FILE` ; - do - echo $h - ssh $USERNAME@$h "sudo /tmp/update_roce_tx_window_en.sh" - ssh $USERNAME@$h "sudo /tmp/install_oci-cn-auth-package.sh" - done - -check_roce_tx_window_en -check_oci_cn_auth_version - -fi diff --git a/slurm_ha.tf b/slurm_ha.tf index b6d9f72a..20cd095d 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -1,15 +1,15 @@ -resource "oci_core_volume_attachment" 
"backup_volume_attachment" { - count = var.controller_block && var.slurm_ha ? 1 : 0 +resource "oci_core_volume_attachment" "backup_volume_attachment" { + count = var.controller_block && var.slurm_ha ? 1 : 0 attachment_type = "iscsi" volume_id = oci_core_volume.controller_volume[0].id instance_id = oci_core_instance.backup[0].id display_name = "${local.cluster_name}-backup-volume-attachment" device = "/dev/oracleoci/oraclevdb" is_shareable = true -} +} resource "oci_core_instance" "backup" { - count = var.slurm_ha ? 1 : 0 + count = var.slurm_ha ? 1 : 0 depends_on = [oci_core_subnet.public-subnet] availability_domain = var.controller_ad compartment_id = var.targetCompartment @@ -17,18 +17,18 @@ resource "oci_core_instance" "backup" { dynamic "shape_config" { for_each = local.is_controller_flex_shape - content { - ocpus = shape_config.value - memory_in_gbs = var.controller_custom_memory ? var.controller_memory : 16 * shape_config.value - } + content { + ocpus = shape_config.value + memory_in_gbs = var.controller_custom_memory ? var.controller_memory : 16 * shape_config.value + } } agent_config { is_management_disabled = true - } - display_name = "${local.cluster_name}-backup" + } + display_name = "${local.cluster_name}-backup" freeform_tags = { - "cluster_name" = local.cluster_name + "cluster_name" = local.cluster_name "parent_cluster" = local.cluster_name } @@ -37,33 +37,34 @@ resource "oci_core_instance" "backup" { user_data = base64encode(data.template_file.controller_config.rendered) } source_details { -// source_id = var.use_standard_image ? data.oci_core_images.linux.images.0.id : local.custom_controller_image_ocid - source_id = local.controller_image + // source_id = var.use_standard_image ? 
data.oci_core_images.linux.images.0.id : local.custom_controller_image_ocid + source_id = local.controller_image boot_volume_size_in_gbs = var.controller_boot_volume_size - source_type = "image" + boot_volume_vpus_per_gb = 30 + source_type = "image" } create_vnic_details { - subnet_id = local.controller_subnet_id + subnet_id = local.controller_subnet_id assign_public_ip = local.controller_bool_ip } -} +} -resource "null_resource" "backup" { - count = var.slurm_ha ? 1 : 0 - depends_on = [oci_core_instance.backup] - triggers = { +resource "null_resource" "backup" { + count = var.slurm_ha ? 1 : 0 + depends_on = [oci_core_instance.backup] + triggers = { backup = oci_core_instance.backup[0].id - } + } provisioner "remote-exec" { inline = [ "#!/bin/bash", - "sudo mkdir -p /opt/oci-hpc", + "sudo mkdir -p /opt/oci-hpc", "sudo chown ${var.controller_username}:${var.controller_username} /opt/oci-hpc/", "mkdir -p /opt/oci-hpc/bin", "mkdir -p /opt/oci-hpc/playbooks" - ] + ] connection { host = local.host_backup type = "ssh" @@ -72,8 +73,8 @@ resource "null_resource" "backup" { } } provisioner "file" { - source = "playbooks" - destination = "/opt/oci-hpc/" + source = "playbooks" + destination = "/opt/oci-hpc/" connection { host = local.host_backup type = "ssh" @@ -134,11 +135,11 @@ resource "null_resource" "backup" { private_key = tls_private_key.ssh.private_key_pem } } - provisioner "file" { - content = templatefile("${path.module}/configure.tpl", { + provisioner "file" { + content = templatefile("${path.module}/configure.tpl", { configure = var.configure }) - destination = "/tmp/configure.conf" + destination = "/tmp/configure.conf" connection { host = local.host_backup type = "ssh" @@ -166,7 +167,7 @@ resource "null_resource" "backup" { "cp /home/${var.controller_username}/.ssh/cluster.key /home/${var.controller_username}/.ssh/id_rsa", "chmod a+x /opt/oci-hpc/bin/*.sh", "timeout --foreground 60m /opt/oci-hpc/bin/controller.sh" - ] + ] connection { host = local.host_backup 
type = "ssh" @@ -175,87 +176,93 @@ resource "null_resource" "backup" { } } } -resource "null_resource" "cluster_backup" { - count = var.slurm_ha ? 1 : 0 - depends_on = [null_resource.backup, oci_core_compute_cluster.compute_cluster, oci_core_cluster_network.cluster_network, oci_core_instance.backup ] - triggers = { +resource "null_resource" "cluster_backup" { + count = var.slurm_ha ? 1 : 0 + depends_on = [null_resource.backup, oci_core_compute_cluster.compute_cluster, oci_core_cluster_network.cluster_network, oci_core_instance.backup] + triggers = { cluster_instances = join(", ", local.cluster_instances_names) - } + } provisioner "file" { - content = templatefile("${path.module}/inventory.tpl", { - controller_name = oci_core_instance.controller.display_name, - controller_ip = oci_core_instance.controller.private_ip, - backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", - backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", - login_name = var.login_node ? oci_core_instance.login[0].display_name : "", - login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", - compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) - public_subnet = data.oci_core_subnet.public_subnet.cidr_block, - private_subnet = data.oci_core_subnet.private_subnet.cidr_block, - rdma_network = cidrhost(var.rdma_subnet, 0), - rdma_netmask = cidrnetmask(var.rdma_subnet), - zone_name = local.zone_name, - dns_entries = var.dns_entries, - nfs = var.node_count > 0 ? 
local.cluster_instances_names[0] : "", - home_nfs = var.home_nfs, - create_fss = var.create_fss, - home_fss = var.home_fss, - scratch_nfs = var.use_scratch_nfs && var.node_count > 0, - cluster_nfs = var.use_cluster_nfs, - cluster_nfs_path = var.cluster_nfs_path, - scratch_nfs_path = var.scratch_nfs_path, - add_nfs = var.add_nfs, - nfs_target_path = var.nfs_target_path, - nfs_source_IP = local.nfs_source_IP, - nfs_source_path = var.nfs_source_path, - nfs_options = var.nfs_options, - localdisk = var.localdisk, - log_vol = var.log_vol, - redundancy = var.redundancy, - cluster_network = var.cluster_network, - use_compute_agent = var.use_compute_agent, - slurm = var.slurm, - slurm_nfs_path = var.slurm_nfs ? var.nfs_source_path : var.cluster_nfs_path, - rack_aware = var.rack_aware, - spack = var.spack, - ldap = var.ldap, - controller_block = var.controller_block, - login_block = var.login_block, - scratch_nfs_type = local.scratch_nfs_type, - controller_mount_ip = local.controller_mount_ip, - login_mount_ip = local.login_mount_ip, - cluster_mount_ip = local.mount_ip, - autoscaling = var.autoscaling, - cluster_name = local.cluster_name, - shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, - instance_pool_ocpus = local.instance_pool_ocpus, - queue=var.queue, - monitoring = var.monitoring, - hyperthreading = var.hyperthreading, - controller_username = var.controller_username, - compute_username = var.compute_username, - autoscaling_monitoring = var.autoscaling_monitoring, + content = templatefile("${path.module}/inventory.tpl", { + controller_name = oci_core_instance.controller.display_name, + controller_ip = oci_core_instance.controller.private_ip, + backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", + backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip : "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? 
oci_core_instance.login[0].private_ip : "", + monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip : "", + compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([], []) + public_subnet = data.oci_core_subnet.public_subnet.cidr_block, + private_subnet = data.oci_core_subnet.private_subnet.cidr_block, + rdma_network = cidrhost(var.rdma_subnet, 0), + rdma_netmask = cidrnetmask(var.rdma_subnet), + zone_name = local.zone_name, + dns_entries = var.dns_entries, + nfs = var.node_count > 0 ? local.cluster_instances_names[0] : "", + home_nfs = var.home_nfs, + create_fss = var.create_fss, + home_fss = var.home_fss, + scratch_nfs = var.use_scratch_nfs && var.node_count > 0, + cluster_nfs = var.use_cluster_nfs, + cluster_nfs_path = var.cluster_nfs_path, + scratch_nfs_path = var.scratch_nfs_path, + add_nfs = var.add_nfs, + nfs_target_path = var.nfs_target_path, + nfs_source_IP = local.nfs_source_IP, + nfs_source_path = var.nfs_source_path, + nfs_options = var.nfs_options, + localdisk = var.localdisk, + log_vol = var.log_vol, + redundancy = var.redundancy, + cluster_network = var.cluster_network, + use_compute_agent = var.use_compute_agent, + slurm = var.slurm, + slurm_nfs_path = var.slurm_nfs ? var.nfs_source_path : var.cluster_nfs_path, + rack_aware = var.rack_aware, + spack = var.spack, + ldap = var.ldap, + controller_block = var.controller_block, + login_block = var.login_block, + scratch_nfs_type = local.scratch_nfs_type, + controller_mount_ip = local.controller_mount_ip, + login_mount_ip = local.login_mount_ip, + cluster_mount_ip = local.mount_ip, + autoscaling = var.autoscaling, + cluster_name = local.cluster_name, + shape = var.cluster_network ? 
var.cluster_network_shape : var.instance_pool_shape, + instance_pool_ocpus = local.instance_pool_ocpus, + queue = var.queue, + cluster_monitoring = var.cluster_monitoring, + hyperthreading = var.hyperthreading, + controller_username = var.controller_username, + compute_username = var.compute_username, + autoscaling_monitoring = var.autoscaling_monitoring, autoscaling_mysql_service = var.autoscaling_mysql_service, - monitoring_mysql_ip = var.autoscaling_monitoring && var.autoscaling_mysql_service ? oci_mysql_mysql_db_system.monitoring_mysql_db_system[0].ip_address : "localhost", - admin_password = var.admin_password, - admin_username = var.autoscaling_mysql_service ? var.admin_username : "root", - enroot = var.enroot, - pyxis = var.pyxis, - pam = var.pam, - sacct_limits = var.sacct_limits, - privilege_sudo = var.privilege_sudo, - privilege_group_name = var.privilege_group_name, - latency_check = var.latency_check, - inst_prin = var.inst_prin, - region = var.region, - tenancy_ocid = var.tenancy_ocid, - api_fingerprint = var.api_fingerprint, - api_user_ocid = var.api_user_ocid, - healthchecks = var.healthchecks - }) + monitoring_mysql_ip = var.autoscaling_monitoring && var.autoscaling_mysql_service ? oci_mysql_mysql_db_system.monitoring_mysql_db_system[0].ip_address : "localhost", + admin_password = var.admin_password, + admin_username = var.autoscaling_mysql_service ? 
var.admin_username : "root", + enroot = var.enroot, + pyxis = var.pyxis, + pam = var.pam, + sacct_limits = var.sacct_limits, + privilege_sudo = var.privilege_sudo, + privilege_group_name = var.privilege_group_name, + latency_check = var.latency_check, + inst_prin = var.inst_prin, + region = var.region, + tenancy_ocid = var.tenancy_ocid, + api_fingerprint = var.api_fingerprint, + api_user_ocid = var.api_user_ocid, + healthchecks = var.healthchecks, + change_hostname = var.change_hostname, + hostname_convention = var.hostname_convention, + change_hostname = var.change_hostname, + hostname_convention = var.hostname_convention + }) - destination = "/opt/oci-hpc/playbooks/inventory" + destination = "/opt/oci-hpc/playbooks/inventory" connection { host = local.host_backup type = "ssh" @@ -266,7 +273,7 @@ resource "null_resource" "cluster_backup" { provisioner "file" { - content = var.node_count > 0 ? join("\n",local.cluster_instances_ips) : "\n" + content = var.node_count > 0 ? join("\n", local.cluster_instances_ips) : "\n" destination = "/tmp/hosts" connection { host = local.host_backup @@ -277,14 +284,14 @@ resource "null_resource" "cluster_backup" { } provisioner "file" { - content = templatefile(var.inst_prin ? "${path.module}/autoscaling/provider_inst_prin.tpl" : "${path.module}/autoscaling/provider_user.tpl", { - api_user_ocid = var.api_user_ocid, - api_fingerprint = var.api_fingerprint, + content = templatefile(var.inst_prin ? 
"${path.module}/autoscaling/provider_inst_prin.tpl" : "${path.module}/autoscaling/provider_user.tpl", { + api_user_ocid = var.api_user_ocid, + api_fingerprint = var.api_fingerprint, private_key_path = "/opt/oci-hpc/autoscaling/credentials/key.pem", - tenancy_ocid = var.tenancy_ocid - }) + tenancy_ocid = var.tenancy_ocid + }) - destination = "/opt/oci-hpc/autoscaling/tf_init/provider.tf" + destination = "/opt/oci-hpc/autoscaling/tf_init/provider.tf" connection { host = local.host_backup type = "ssh" @@ -294,28 +301,32 @@ resource "null_resource" "cluster_backup" { } provisioner "file" { - content = templatefile("${path.module}/queues.conf", { - cluster_network = var.cluster_network, - use_compute_agent = var.use_compute_agent, - compute_cluster = var.compute_cluster, - marketplace_listing = var.marketplace_listing, - image = local.image_ocid, - use_marketplace_image = var.use_marketplace_image, - boot_volume_size = var.boot_volume_size, - shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, - region = var.region, - ad = var.use_multiple_ads? join(" ", [var.ad, var.secondary_ad, var.third_ad]) : var.ad, - private_subnet = var.private_subnet, - private_subnet_id = var.private_subnet_id, - targetCompartment = var.targetCompartment, - instance_pool_ocpus = local.instance_pool_ocpus, - instance_pool_memory = var.instance_pool_memory, + content = templatefile("${path.module}/queues.conf", { + cluster_network = var.cluster_network, + use_compute_agent = var.use_compute_agent, + compute_cluster = var.compute_cluster, + marketplace_listing = var.marketplace_listing, + image = local.image_ocid, + use_marketplace_image = var.use_marketplace_image, + boot_volume_size = var.boot_volume_size, + shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, + region = var.region, + ad = var.use_multiple_ads ? 
join(" ", [var.ad, var.secondary_ad, var.third_ad]) : var.ad, + private_subnet = var.private_subnet, + private_subnet_id = var.private_subnet_id, + targetCompartment = var.targetCompartment, + instance_pool_ocpus = local.instance_pool_ocpus, + instance_pool_memory = var.instance_pool_memory, instance_pool_custom_memory = var.instance_pool_custom_memory, - queue=var.queue, - hyperthreading = var.hyperthreading - }) + queue = var.queue, + hyperthreading = var.hyperthreading, + cluster_name = local.cluster_name, + change_hostname = var.change_hostname, + hostname_convention = var.hostname_convention - destination = "/opt/oci-hpc/conf/queues.conf" + }) + + destination = "/opt/oci-hpc/conf/queues.conf" connection { host = local.host_backup type = "ssh" @@ -323,88 +334,92 @@ resource "null_resource" "cluster_backup" { private_key = tls_private_key.ssh.private_key_pem } } - + provisioner "file" { - content = templatefile("${path.module}/conf/variables.tpl", { - controller_name = oci_core_instance.controller.display_name, - controller_ip = oci_core_instance.controller.private_ip, - backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", - backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", - login_name = var.login_node ? oci_core_instance.login[0].display_name : "", - login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", - compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) - public_subnet = data.oci_core_subnet.public_subnet.cidr_block, - public_subnet_id = local.controller_subnet_id, - private_subnet = data.oci_core_subnet.private_subnet.cidr_block, - private_subnet_id = local.subnet_id, - rdma_subnet = var.rdma_subnet, - nfs = var.node_count > 0 && var.use_scratch_nfs ? 
local.cluster_instances_names[0] : "", - scratch_nfs = var.use_scratch_nfs && var.node_count > 0, - scratch_nfs_path = var.scratch_nfs_path, - use_scratch_nfs = var.use_scratch_nfs, - slurm = var.slurm, - slurm_nfs_path = var.add_nfs ? var.nfs_source_path : var.cluster_nfs_path - rack_aware = var.rack_aware, - spack = var.spack, - ldap = var.ldap, - controller_block = var.controller_block, - login_block = var.login_block, - scratch_nfs_type = local.scratch_nfs_type, - controller_mount_ip = local.controller_mount_ip, - login_mount_ip = local.login_mount_ip, - cluster_mount_ip = local.mount_ip, - scratch_nfs_type_cluster = var.scratch_nfs_type_cluster, - scratch_nfs_type_pool = var.scratch_nfs_type_pool, + content = templatefile("${path.module}/conf/variables.tpl", { + controller_name = oci_core_instance.controller.display_name, + controller_ip = oci_core_instance.controller.private_ip, + backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", + backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip : "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? oci_core_instance.login[0].private_ip : "", + monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip : "", + compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([], []) + public_subnet = data.oci_core_subnet.public_subnet.cidr_block, + public_subnet_id = local.controller_subnet_id, + private_subnet = data.oci_core_subnet.private_subnet.cidr_block, + private_subnet_id = local.subnet_id, + rdma_subnet = var.rdma_subnet, + nfs = var.node_count > 0 && var.use_scratch_nfs ? 
local.cluster_instances_names[0] : "", + scratch_nfs = var.use_scratch_nfs && var.node_count > 0, + scratch_nfs_path = var.scratch_nfs_path, + use_scratch_nfs = var.use_scratch_nfs, + slurm = var.slurm, + slurm_nfs_path = var.add_nfs ? var.nfs_source_path : var.cluster_nfs_path + rack_aware = var.rack_aware, + spack = var.spack, + ldap = var.ldap, + controller_block = var.controller_block, + login_block = var.login_block, + scratch_nfs_type = local.scratch_nfs_type, + controller_mount_ip = local.controller_mount_ip, + login_mount_ip = local.login_mount_ip, + cluster_mount_ip = local.mount_ip, + scratch_nfs_type_cluster = var.scratch_nfs_type_cluster, + scratch_nfs_type_pool = var.scratch_nfs_type_pool, controller_block_volume_performance = var.controller_block_volume_performance, - region = var.region, - tenancy_ocid = var.tenancy_ocid, - vcn_subnet = var.vcn_subnet, - vcn_id = local.vcn_id, - zone_name = local.zone_name, - dns_entries = var.dns_entries, - cluster_block_volume_size = var.cluster_block_volume_size, - cluster_block_volume_performance = var.cluster_block_volume_performance, - ssh_cidr = var.ssh_cidr, - use_cluster_nfs = var.use_cluster_nfs, - cluster_nfs_path = var.cluster_nfs_path, - home_nfs = var.home_nfs, - create_fss = var.create_fss, - home_fss = var.home_fss, - add_nfs = var.add_nfs, - nfs_target_path = var.nfs_target_path, - nfs_source_IP = local.nfs_source_IP, - nfs_source_path = var.nfs_source_path, - nfs_options = var.nfs_options, - localdisk = var.localdisk, - log_vol = var.log_vol, - redundancy = var.redundancy, - monitoring = var.monitoring, - hyperthreading = var.hyperthreading, - unsupported = var.unsupported, - autoscaling_monitoring = var.autoscaling_monitoring, - enroot = var.enroot, - pyxis = var.pyxis, - pam = var.pam, - sacct_limits = var.sacct_limits, - privilege_sudo = var.privilege_sudo, - privilege_group_name = var.privilege_group_name, - latency_check = var.latency_check, - private_deployment = var.private_deployment, - 
controller_username = var.controller_username, - compute_username = var.compute_username, - use_multiple_ads = var.use_multiple_ads, - use_compute_agent = var.use_compute_agent, - BIOS = var.BIOS, - IOMMU = var.IOMMU, - SMT = var.SMT, - virt_instr = var.virt_instr, - access_ctrl = var.access_ctrl, - numa_nodes_per_socket = var.numa_nodes_per_socket, - percentage_of_cores_enabled = var.percentage_of_cores_enabled, - healthchecks = var.healthchecks - }) + region = var.region, + tenancy_ocid = var.tenancy_ocid, + vcn_subnet = var.vcn_subnet, + vcn_id = local.vcn_id, + zone_name = local.zone_name, + dns_entries = var.dns_entries, + cluster_block_volume_size = var.cluster_block_volume_size, + cluster_block_volume_performance = var.cluster_block_volume_performance, + ssh_cidr = var.ssh_cidr, + use_cluster_nfs = var.use_cluster_nfs, + cluster_nfs_path = var.cluster_nfs_path, + home_nfs = var.home_nfs, + create_fss = var.create_fss, + home_fss = var.home_fss, + add_nfs = var.add_nfs, + nfs_target_path = var.nfs_target_path, + nfs_source_IP = local.nfs_source_IP, + nfs_source_path = var.nfs_source_path, + nfs_options = var.nfs_options, + localdisk = var.localdisk, + log_vol = var.log_vol, + redundancy = var.redundancy, + cluster_monitoring = var.cluster_monitoring, + hyperthreading = var.hyperthreading, + unsupported = var.unsupported, + autoscaling_monitoring = var.autoscaling_monitoring, + enroot = var.enroot, + pyxis = var.pyxis, + pam = var.pam, + sacct_limits = var.sacct_limits, + privilege_sudo = var.privilege_sudo, + privilege_group_name = var.privilege_group_name, + latency_check = var.latency_check, + private_deployment = var.private_deployment, + controller_username = var.controller_username, + compute_username = var.compute_username, + use_multiple_ads = var.use_multiple_ads, + use_compute_agent = var.use_compute_agent, + BIOS = var.BIOS, + IOMMU = var.IOMMU, + SMT = var.SMT, + virt_instr = var.virt_instr, + access_ctrl = var.access_ctrl, + numa_nodes_per_socket 
= var.numa_nodes_per_socket, + percentage_of_cores_enabled = var.percentage_of_cores_enabled, + healthchecks = var.healthchecks, + change_hostname = var.change_hostname, + hostname_convention = var.hostname_convention + }) - destination = "/opt/oci-hpc/conf/variables.tf" + destination = "/opt/oci-hpc/conf/variables.tf" connection { host = local.host_backup type = "ssh" @@ -416,7 +431,7 @@ resource "null_resource" "cluster_backup" { provisioner "file" { content = base64decode(var.api_user_key) - destination = "/opt/oci-hpc/autoscaling/credentials/key.initial" + destination = "/opt/oci-hpc/autoscaling/credentials/key.initial" connection { host = local.host_backup type = "ssh" @@ -432,7 +447,7 @@ resource "null_resource" "cluster_backup" { "chmod 755 /opt/oci-hpc/autoscaling/credentials/key.sh", "/opt/oci-hpc/autoscaling/credentials/key.sh /opt/oci-hpc/autoscaling/credentials/key.initial /opt/oci-hpc/autoscaling/credentials/key.pem > /opt/oci-hpc/autoscaling/credentials/key.log", "chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem", - "echo ${var.configure} > /tmp/configure.conf"] + "echo ${var.configure} > /tmp/configure.conf"] connection { host = local.host_backup type = "ssh" @@ -444,16 +459,16 @@ resource "null_resource" "cluster_backup" { resource "oci_dns_rrset" "rrset-backup" { - count = var.slurm_ha && var.dns_entries ? 1 : 0 + count = var.slurm_ha && var.dns_entries ? 1 : 0 zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id domain = "${var.slurm_ha ? oci_core_instance.backup[0].display_name : ""}.${local.zone_name}" rtype = "A" items { domain = "${var.slurm_ha ? oci_core_instance.backup[0].display_name : ""}.${local.zone_name}" rtype = "A" - rdata = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "" + rdata = var.slurm_ha ? 
oci_core_instance.backup[0].private_ip : "" ttl = 3600 } - scope = "PRIVATE" + scope = "PRIVATE" view_id = data.oci_dns_views.dns_views.views[0].id } \ No newline at end of file diff --git a/variables.tf b/variables.tf index fcee9d94..8eb5db18 100755 --- a/variables.tf +++ b/variables.tf @@ -1,98 +1,349 @@ -variable "region" {} -variable "tenancy_ocid" {} -variable "targetCompartment" {} -variable "ad" {} -variable "secondary_ad" { default = "" } -variable "third_ad" { default = "" } -variable "use_multiple_ads" { default = false } -variable "ssh_key" { } -variable "cluster_network" { default = true } -variable "compute_cluster" { default = false } -variable "compute_cluster_exists" { default = false } -variable "compute_cluster_id" { default = "" } -variable "compute_cluster_start_index" { default = 0 } -variable "use_custom_name" { default = false } -variable "cluster_name" { default = "" } -variable "controller_ad" {} -variable "controller_shape" { default = "VM.Standard2.4" } -variable "controller_object_storage_par" { default = true } -variable "custom_controller_image" { - type = string - default = "image.ocid" -} -variable "custom_login_image" { - type = string - default = "image.ocid" -} -variable "controller_boot_volume_size" {} -variable "controller_boot_volume_backup" {} -variable "controller_boot_volume_backup_type" {default = "INCREMENTAL"} -variable "controller_boot_volume_backup_period" {default = "ONE_DAY"} -variable "controller_boot_volume_backup_retention_seconds" {default = "7776000"} -variable "controller_boot_volume_backup_time_zone" {default = "REGIONAL_DATA_CENTER_TIME"} -variable "cluster_network_shape" { default = "BM.HPC2.36" } -variable "instance_pool_shape" { default = "VM.Standard2.4" } -variable "node_count" { default = 2 } -variable "boot_volume_size" { default = 50 } -variable "use_marketplace_image" { default = true} -variable "image" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } 
-variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } -variable "use_compute_agent" { default = true } -variable "unsupported_controller_image" { default = "" } -variable "unsupported_login_image" { default = "" } -variable "use_cluster_nfs" { default = true} -variable "use_scratch_nfs" { default = false } -variable "cluster_nfs_path" { default = "/nfs/cluster" } -variable "scratch_nfs_path" { default = "/nfs/scratch" } -variable "vcn_compartment" { default = ""} -variable "vcn_id" { default = ""} -variable "use_existing_vcn" { default = false} -variable "public_subnet_id" { default = ""} -variable "private_subnet_id" { default = ""} -variable "vcn_subnet" { default = "172.16.0.0/21" } -variable "public_subnet" { default = "172.16.0.0/24" } -variable "additional_subnet" { default = "172.16.1.0/24" } -variable "rdma_subnet" { default = "192.168.0.0/16" } -variable "private_subnet" { default = "172.16.4.0/22" } -variable "ssh_cidr" { default = "0.0.0.0/0" } -variable "slurm" { default = false } -variable "slurm_ha" { default = false } -variable "login_node" { default = true } -variable "login_ad" {default = ""} -variable "login_shape" { default = "VM.Standard2.4" } -variable "login_boot_volume_size" {default = 50} -variable "slurm_nfs" { default = false } -variable "rack_aware" { default = false } -variable "ldap" { default = true } -variable "spack" { default = false } -variable "controller_ocpus" { default = 2} -variable "controller_ocpus_denseIO_flex" { default = 8} -variable "instance_pool_ocpus" { default = 2} -variable "instance_pool_ocpus_denseIO_flex" { default = 8} -variable "instance_pool_memory" { default = 16 } -variable "instance_pool_custom_memory" { default = false } -variable "login_ocpus" { default = 2} -variable "login_ocpus_denseIO_flex" { default = 8} -variable "controller_memory" { default = 16 } -variable "controller_custom_memory" { default = false } -variable "login_memory" { 
default = 16 } -variable "login_custom_memory" { default = false } -variable "privilege_sudo" { default = true } -variable "privilege_group_name" { default = "privilege" } +variable "region" { + type = string +} +variable "tenancy_ocid" { + type = string +} +variable "targetCompartment" { + type = string +} +variable "ad" { + type = string +} +variable "secondary_ad" { + default = "" + type = string + } +variable "third_ad" { + default = "" + type = string + } +variable "use_multiple_ads" { + default = false + type = bool + } +variable "ssh_key" { + type = string +} +variable "cluster_network" { + default = true + type = bool + } +variable "compute_cluster" { + default = false + type = bool + } +variable "compute_cluster_exists" { + default = false + type = bool + } +variable "compute_cluster_id" { + default = "" + type = string + } +variable "compute_cluster_start_index" { + default = 0 + type = number + } +variable "use_custom_name" { + default = false + type = bool + } +variable "cluster_name" { + default = "" + type = string + } +variable "controller_ad" { + type = string +} +variable "controller_shape" { + default = "VM.Standard2.4" + type = string + } +variable "controller_object_storage_par" { + default = true + type = bool + } +variable "custom_controller_image" { + type = string + default = "image.ocid" +} +variable "custom_login_image" { + type = string + default = "image.ocid" +} +variable "custom_monitoring_image" { + type = string + default = "image.ocid" +} +variable "controller_boot_volume_size" { + type = number +} +variable "controller_boot_volume_backup" { + type = bool +} +variable "controller_boot_volume_backup_type" { + default = "INCREMENTAL" + type = string + } +variable "controller_boot_volume_backup_period" { + default = "ONE_DAY" + type = string + } +variable "controller_boot_volume_backup_retention_seconds" { + default = "7776000" + type = string + } +variable "controller_boot_volume_backup_time_zone" { + default = 
"REGIONAL_DATA_CENTER_TIME" + type = string + } +variable "cluster_network_shape" { + default = "BM.HPC2.36" + type = string + } +variable "instance_pool_shape" { + default = "VM.Standard2.4" + type = string + } +variable "node_count" { + default = 2 + type = number + } +variable "boot_volume_size" { + default = 50 + type = number + } +variable "use_marketplace_image" { + default = true + type = bool + } +variable "image" { + default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" + type = string + } +variable "image_ocid" { + default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" + type = string + } +variable "use_compute_agent" { + default = true + type = bool + } +variable "unsupported_controller_image" { + default = "" + type = string + } +variable "unsupported_login_image" { + default = "" + type = string + } +variable "unsupported_monitoring_image" { + default = "" + type = string + } +variable "use_cluster_nfs" { + default = true + type = bool + } +variable "use_scratch_nfs" { + default = false + type = bool + } +variable "cluster_nfs_path" { + default = "/nfs/cluster" + type = string + } +variable "scratch_nfs_path" { + default = "/nfs/scratch" + type = string + } +variable "vcn_compartment" { + default = "" + type = string + } +variable "vcn_id" { + default = "" + type = string + } +variable "use_existing_vcn" { + type = bool + default = false + } +variable "public_subnet_id" { + default = "" + type = string + } +variable "private_subnet_id" { + default = "" + type = string + } +variable "vcn_subnet" { + default = "172.16.0.0/21" + type = string + } +variable "public_subnet" { + default = "172.16.0.0/24" + type = string + } +variable "additional_subnet" { + default = "172.16.1.0/24" + type = string + } +variable "rdma_subnet" { + default = "192.168.0.0/16" + type = string + } +variable "private_subnet" { + default = "172.16.4.0/22" + type = string + } +variable "ssh_cidr" { + default = 
"0.0.0.0/0" + type = string + } +variable "slurm" { + default = false + type = bool + } +variable "slurm_ha" { + default = false + type = bool + } +variable "login_node" { + default = true + type = bool + } +variable "login_ad" { + default = "" + type = string + } +variable "login_shape" { + default = "VM.Standard2.4" + type = string + } +variable "login_boot_volume_size" { + default = 50 + type = number + } +variable "monitoring_node" { + default = false + type = bool + } +variable "monitoring_ad" { + default = "" + type = string + } +variable "monitoring_shape" { + default = "VM.Standard2.4" + type = string + } +variable "monitoring_boot_volume_size" { + default = 50 + type = number + } +variable "slurm_nfs" { + default = false + type = bool + } +variable "rack_aware" { + default = false + type = bool + } +variable "ldap" { + default = true + type = bool + } +variable "spack" { + default = false + type = bool + } +variable "controller_ocpus" { + default = 2 + type = number + } +variable "controller_ocpus_denseIO_flex" { + default = 8 + type = number + } +variable "instance_pool_ocpus" { + default = 2 + type = number + } +variable "instance_pool_ocpus_denseIO_flex" { + default = 8 + type = number + } +variable "instance_pool_memory" { + default = 16 + type = number + } +variable "instance_pool_custom_memory" { + default = false + type = bool + } +variable "login_ocpus" { + default = 2 + type = number + } +variable "login_ocpus_denseIO_flex" { + default = 8 + type = number + } +variable "monitoring_ocpus" { + default = 2 + type = number + } +variable "monitoring_ocpus_denseIO_flex" { + default = 8 + type = number + } +variable "controller_memory" { + default = 16 + type = number + } +variable "controller_custom_memory" { + default = false + type = bool + } +variable "login_memory" { + default = 16 + type = number + } +variable "login_custom_memory" { + default = false + type = bool + } +variable "monitoring_memory" { + default = 16 + type = number + } +variable 
"monitoring_custom_memory" { + default = false + type = bool + } +variable "privilege_sudo" { + default = true + type = bool + } +variable "privilege_group_name" { + default = "privilege" + type = string + } -variable "marketplace_listing" { - default = "HPC_OL7" -} -variable "marketplace_version_id" { - type = map(string) + +variable "marketplace_listing" { + default = "HPC_OL8" + type = string +} +variable "marketplace_version_id" { + type = map(string) default = { - "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.05.08-0" - "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-2024.05.08-0" - "GPU_OL8_NV550" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-550-CUDA-12.4-2024.05.08-0" - "GPU_OL7_NV550" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-550-CUDA-12.4-2024.05.13-0" - "GPU_OL8_NV535" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.05.08-0" - "GPU_OL7_NV535" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.05.13-0" + "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.09.18-0" + "GPU_OL8_NV560" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-560-CUDA-12.6-2024.09.18-0" + "GPU_OL8_NV550" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-550-CUDA-12.4-2024.09.18-0" + "GPU_OL8_NV535" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.09.18-0" } } @@ -100,184 +351,345 @@ variable "marketplace_version_id" { # oci compute pic listing list --display-name "Oracle Linux 7 - HPC Cluster Networking Image" variable "marketplace_listing_id_HPC" { - default = "ocid1.appcataloglisting.oc1..aaaaaaaahz2xiwfcsbebmqg7sp6lhdt6r2vsjro5jfukkl5cntlqvfhkbzaq" + default = "ocid1.appcataloglisting.oc1..aaaaaaaahz2xiwfcsbebmqg7sp6lhdt6r2vsjro5jfukkl5cntlqvfhkbzaq" + type = string } variable "marketplace_listing_id_GPU" { - default = "ocid1.appcataloglisting.oc1..aaaaaaaab2hkpxsglxfbzitiiqv6djxzj5q5soxotwdem2dd2kbifgk4p55q" + default = 
"ocid1.appcataloglisting.oc1..aaaaaaaab2hkpxsglxfbzitiiqv6djxzj5q5soxotwdem2dd2kbifgk4p55q" + type = string } -variable "controller_block_volume_performance" { -/* +variable "controller_block_volume_performance" { + /* Allowed values "0. Lower performance" "10. Balanced performance" "20. High Performance" -*/ +*/ -default = "10. Balanced performance" + default = "10. Balanced performance" + type = string } -variable "controller_block" { +variable "controller_block" { default = false -} + type = bool +} -variable "controller_block_volume_size" { +variable "controller_block_volume_size" { default = 1000 + type = number } -variable "login_block_volume_performance" { -/* +variable "login_block_volume_performance" { + /* Allowed values "0. Lower performance" "10. Balanced performance" "20. High Performance" -*/ - -default = "10. Balanced performance" +*/ + default = "10. Balanced performance" + type = string } - -variable "login_block" { +variable "login_block" { default = false -} + type = bool +} -variable "login_block_volume_size" { +variable "login_block_volume_size" { default = 1000 + type = number } -variable "scratch_nfs_type_cluster" { default = "nvme"} -variable "scratch_nfs_type_pool" { default = "none" } -variable "cluster_block_volume_size" { default = "1000" } -variable "cluster_block_volume_performance" { default = "10. Balanced performance"} +variable "scratch_nfs_type_cluster" { + default = "nvme" + type = string + } +variable "scratch_nfs_type_pool" { + default = "none" + type = string + } +variable "cluster_block_volume_size" { + default = "1000" + type = string + } +variable "cluster_block_volume_performance" { + default = "10. 
Balanced performance" + type = string + } -variable "inst_prin" { default = true} -variable "api_user_key" { default = ""} -variable "api_fingerprint" { default = ""} -variable "api_user_ocid" { default = ""} -variable "home_nfs" { default = true } -variable "home_fss" { default = false } -variable "configure" { default = true } +variable "inst_prin" { + default = true + type = bool + } +variable "api_user_key" { + default = "" + type = string + } +variable "api_fingerprint" { + default = "" + type = string + } +variable "api_user_ocid" { + default = "" + type = string + } +variable "home_nfs" { + default = true + type = bool + } +variable "home_fss" { + default = false + type = bool + } +variable "configure" { + default = true + type = bool + } -variable "hyperthreading" { default = true } +variable "hyperthreading" { + default = true + type = bool + } -variable "autoscaling" { default = false } -variable "latency_check" { default = true } -variable "add_nfs" { default = false} -variable "create_fss" { default = false } -variable "fss_compartment" {default = ""} -variable "fss_ad" {default = ""} -variable "nfs_target_path" { default = "/app"} -variable "nfs_source_IP" { default = ""} -variable "nfs_source_path" { default = "/app"} -variable "nfs_options" {default = ""} -variable "monitoring" { default = true } -variable "enroot" { default = false } -variable "pyxis" { default = false } -variable "pam" { default = false } -variable "sacct_limits" { default = false } +variable "autoscaling" { + default = false + type = bool + } +variable "latency_check" { + default = true + type = bool + } +variable "add_nfs" { + default = false + type = bool + } +variable "create_fss" { + default = false + type = bool + } +variable "mount_target_count" { + default = 1 + type = number + } +variable "fss_compartment" { + default = "" + type = string + } +variable "fss_ad" { + default = "" + type = string + } +variable "nfs_target_path" { + default = "/app" + type = string + } 
+variable "nfs_source_IP" { + default = "" + type = string + } +variable "nfs_list_of_mount_target_IPs" { + default = "" + type = string + } +variable "nfs_source_path" { + default = "/app" + type = string + } +variable "nfs_options" { + default = "" + type = string + } +variable "enroot" { + default = false + type = bool + } +variable "cluster_monitoring" { + default = false + type = bool + } +variable "pyxis" { + default = false + type = bool + } +variable "pam" { + default = false + type = bool + } +variable "sacct_limits" { + default = false + type = bool + } -variable "unsupported" { - type=bool +variable "unsupported" { + type = bool default = false -} +} -variable "queue" {default = "compute"} -variable "unsupported_controller" { - type=bool - default = false +variable "queue" { + default = "compute" + type = string + } +variable "unsupported_controller" { + type = bool + default = false } -variable "use_marketplace_image_controller" { - type=bool - default = true +variable "use_marketplace_image_controller" { + type = bool + default = true } -variable "unsupported_login" { - type=bool - default = false +variable "unsupported_login" { + type = bool + default = false +} +variable "unsupported_monitoring" { + type = bool + default = false +} +variable "controller_username" { + type = string + default = "opc" } -variable "controller_username" { - type = string - default = "opc" -} - -variable "compute_username" { - type = string - default = "opc" -} -variable "login_username" { - type = string - default = "opc" -} -variable "autoscaling_monitoring" { - type= bool +variable "compute_username" { + type = string + default = "opc" +} +variable "login_username" { + type = string + default = "opc" +} +variable "monitoring_username" { + type = string + default = "opc" +} +variable "autoscaling_monitoring" { + type = bool default = false -} +} -variable "autoscaling_mysql_service" { - type= bool +variable "autoscaling_mysql_service" { + type = bool default = false -} 
+} -variable "monitoring_shape_name" { - type = string +variable "monitoring_shape_name" { + type = string default = "MySQL.VM.Standard.E3.1.16GB" -} +} -variable "admin_username" { - type = string +variable "admin_username" { + type = string default = "admin" -} +} -variable "admin_password" { - type = string +variable "admin_password" { + type = string default = "Monitor1234!" } -variable scratch_nfs_mount { default = ""} -variable scratch_nfs_export {default = ""} -variable cluster_nfs_mount {default = ""} -variable cluster_nfs_export {default = ""} +variable "scratch_nfs_mount" { + default = "" + type = string + } +variable "scratch_nfs_export" { + + default = "" + type = string + } +variable "cluster_nfs_mount" { + default = "" + type = string + } +variable "cluster_nfs_export" { + default = "" + type = string + } -variable "private_deployment" { default = false } +variable "private_deployment" { + default = false + type = bool + } -variable "localdisk" { default = true } -variable "log_vol" { default = false } -variable "redundancy" { default = true } +variable "localdisk" { + default = true + type = bool + } +variable "log_vol" { + default = false + type = bool + } +variable "redundancy" { + default = true + type = bool + } -variable "use_marketplace_image_login" { default = true} +variable "use_marketplace_image_login" { + default = true + type = bool + } +variable "use_marketplace_image_monitoring" { + default = true + type = bool + } -variable "marketplace_listing_login" { - default = "HPC_OL7" -} -variable "marketplace_listing_controller" { - default = "HPC_OL7" -} +variable "marketplace_listing_login" { + default = "HPC_OL8" + type = string +} +variable "marketplace_listing_monitoring" { + default = "HPC_OL8" + type = string +} +variable "marketplace_listing_controller" { + default = "HPC_OL8" + type = string +} variable "zone_name" { default = "" + type = string } variable "dns_entries" { default = true + type = bool } variable "healthchecks" { default 
= true + type = bool } variable "BIOS" { default = false + type = bool } variable "IOMMU" { default = false + type = bool } variable "SMT" { default = true + type = bool } variable "virt_instr" { default = false + type = bool } variable "access_ctrl" { default = false + type = bool } variable "numa_nodes_per_socket" { default = "Default" + type = string } variable "percentage_of_cores_enabled" { default = "Default" -} \ No newline at end of file + type = string +} +variable "change_hostname" { + default = false + type = bool +} +variable "hostname_convention" { + default = "GPU" + type = string +} diff --git a/versions.tf b/versions.tf index ec66572c..02907399 100755 --- a/versions.tf +++ b/versions.tf @@ -1,9 +1,29 @@ terraform { - required_version = ">= 1.0" + required_version = ">= 1.2" required_providers { - oci = { - source = "oracle/oci" - version = "5.37.0" - } + oci = { + source = "oracle/oci" + version = ">= 6.9.0" + } + local = { + source = "hashicorp/local" + version = ">= 2.1.0" + } + tls = { + source = "hashicorp/tls" + version = ">= 3.0.0" + } + random = { + source = "hashicorp/random" + version = ">= 3.0.0" + } + null = { + source = "hashicorp/null" + version = ">= 3.0.0" + } + template = { + source = "hashicorp/template" + version = ">= 2.2.0" + } } } \ No newline at end of file