Skip to content

Commit

Permalink
longhorn: better alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
paulfantom committed Dec 1, 2024
1 parent 87e3fbd commit b85a951
Showing 1 changed file with 27 additions and 21 deletions.
48 changes: 27 additions & 21 deletions base/longhorn-system/prometheusrule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,53 +9,62 @@ spec:
rules:
# Warns when a volume's actual size has been above 90% of its capacity for
# 5 minutes.
- alert: LonghornVolumeActualSpaceUsedWarning
  annotations:
    # Folded scalar: renders as one line, with no trailing newline.
    description: >-
      The actual space used by Longhorn volume {{$labels.volume}} on
      {{$labels.node}} is at {{$value}}% capacity for more than 5 minutes.
    summary: The actual used space of Longhorn volume is over 90% of the capacity.
  # Ratio is scaled to a percentage, so {{$value}} above is already in percent.
  expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) * 100 > 90
  for: 5m
  labels:
    issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
    severity: warning
# Critical alert for a volume whose robustness metric has been 3 for 5 minutes.
- alert: LonghornVolumeStatusCritical
  annotations:
    # The stated duration must track the `for:` hold time below (5m).
    description: >-
      Longhorn volume {{$labels.volume}} on {{$labels.node}} has been Faulted
      for more than 5 minutes.
    summary: Longhorn volume is Faulted
  # NOTE(review): 3 is taken to be the faulted state, as the annotations
  # describe — confirm against the Longhorn metrics reference.
  expr: longhorn_volume_robustness == 3
  for: 5m
  labels:
    issue: Longhorn volume {{$labels.volume}} is Fault.
    severity: critical
# Warns when a volume's robustness metric has been 2 (Degraded, per the
# annotations) for 5 minutes.
- alert: LonghornVolumeStatusWarning
  annotations:
    # NOTE(review): relies on the metric carrying `pvc` and `pvc_namespace`
    # labels — confirm the deployed Longhorn version exports them.
    description: >-
      Longhorn volume {{$labels.volume}} responsible for PVC {{$labels.pvc}}
      in namespace {{$labels.pvc_namespace}} on {{$labels.node}} is Degraded
      for more than 5 minutes.
    summary: Longhorn volume is Degraded
  expr: longhorn_volume_robustness == 2
  for: 5m
  labels:
    issue: Longhorn volume {{$labels.volume}} is Degraded.
    severity: warning
# Compares, per PVC/namespace/node, the kubelet-reported used bytes with the
# Longhorn actual volume size; fires once the ratio has exceeded 1.1 for 6h.
- alert: LonghornVolumeOverprovisioned
  annotations:
    description: >-
      PVC {{$labels.persistentvolumeclaim}} in namespace {{$labels.namespace}}
      on node {{$labels.node}} is overprovisioned by a factor of
      {{$value | humanizePercent}}. Run fstrim if possible.
    summary: Longhorn volume overprovisioned
  # The two nested label_replace calls copy `pvc` -> `persistentvolumeclaim`
  # and `pvc_namespace` -> `namespace` on the Longhorn series so that both
  # sides of the division share the same grouping labels.
  # NOTE(review): the ratio is kubelet used / Longhorn actual, yet the
  # "Run fstrim" advice suggests the alert is meant for actual size exceeding
  # filesystem usage — confirm the operand order is intended.
  expr: |
    sum by (persistentvolumeclaim, namespace, node) (kubelet_volume_stats_used_bytes)
    /
    sum by (persistentvolumeclaim, namespace, node) (
    label_replace(
    label_replace(
    longhorn_volume_actual_size_bytes, "persistentvolumeclaim", "$1", "pvc", "(.+)"
    ),
    "namespace", "$1", "pvc_namespace", "(.+)"
    )
    )
    > 1.1
  for: 6h
  labels:
    severity: warning
# Warns when a node's used storage has been above 85% of its capacity for
# 5 minutes.
- alert: LonghornNodeStorageWarning
  annotations:
    description: >-
      The used storage of node {{$labels.node}} is at {{$value}}% capacity
      for more than 5 minutes.
    summary: The used storage of node is over 85% of the capacity.
  # Ratio is scaled to a percentage, so {{$value}} above is already in percent.
  expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 85
  for: 5m
  labels:
    issue: The used storage of node {{$labels.node}} is high.
    severity: warning
# Warns when a single disk's usage has been above 85% of its capacity for
# 5 minutes.
- alert: LonghornDiskStorageWarning
  annotations:
    description: >-
      The used storage of disk {{$labels.disk}} on node {{$labels.node}} is
      at {{$value}}% capacity for more than 5 minutes.
    summary: The used storage of disk is over 85% of the capacity.
  # Ratio is scaled to a percentage, so {{$value}} above is already in percent.
  expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 85
  for: 5m
  labels:
    issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
    severity: warning
- alert: LonghornNodeDown
annotations:
Expand All @@ -64,17 +73,15 @@ spec:
expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
for: 5m
labels:
issue: There are {{$value}} Longhorn nodes are offline
severity: warning
# Warns when an instance manager's CPU usage has been above 300% of its CPU
# request for 5 minutes.
# NOTE(review): "Intance" in the alert name looks like a typo for "Instance";
# it is left unchanged because the name is the alert's identity for anything
# that matches on it (routes, silences, dashboards).
- alert: LonghornIntanceManagerCPUUsageWarning
  annotations:
    description: >-
      Longhorn instance manager {{$labels.instance_manager}} on
      {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for more
      than 5 minutes.
    summary: Longhorn instance manager has a ratio of CPU Usage to request at over 300%.
  # Usage / request, scaled to a percentage; 300 means three times the request.
  expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
  for: 5m
  labels:
    issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
    severity: warning
- alert: LonghornNodeCPUUsageWarning
annotations:
Expand All @@ -84,7 +91,6 @@ spec:
expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
for: 5m
labels:
issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
severity: warning
- alert: LonghornVolumeBackupStuck
expr: count by (volume) (longhorn_backup_state < 2)
Expand Down

0 comments on commit b85a951

Please sign in to comment.